<a href="https://colab.research.google.com/github/FernandoBRdgz/inteligencia_artificial/blob/main/incrustaciones_de_palabras/word2vec_yelp.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Introducción

El conjunto de datos de Yelp es un subconjunto de los negocios, las reseñas y los datos de usuario de la plataforma, publicado para uso personal, educativo y académico. Se distribuye como archivos JSON y puede utilizarse para enseñar bases de datos, para aprender procesamiento de lenguaje natural (NLP) o como datos de producción de muestra al aprender a crear aplicaciones móviles.

Enlace al conjunto de datos: https://www.kaggle.com/datasets/yelp-dataset/yelp-dataset

In [None]:
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import os
import json
from pprint import pprint

In [None]:
main_path = '/content/drive/MyDrive/'

In [None]:
data_directory = os.path.join(main_path, 'data', 'yelp_dataset')

In [None]:
businesses_filepath = os.path.join(data_directory, 'yelp_academic_dataset_business.json')

In [None]:
with open(businesses_filepath) as f:
    first_business_record = f.readline() 

pprint(first_business_record)

In [None]:
review_json_filepath = os.path.join(data_directory, 'yelp_academic_dataset_review.json')

In [None]:
with open(review_json_filepath) as f:
    first_review_record = f.readline()
    
pprint(first_review_record)

In [None]:
# Collect the business_id of every record whose 'categories' field contains
# 'Restaurants'; records with a missing or empty 'categories' are ignored.
restaurant_ids = set()

with open(businesses_filepath) as f:
    for business_json in f:
        business = json.loads(business_json)
        categories = business.get('categories')
        if categories and 'Restaurants' in categories:
            restaurant_ids.add(business['business_id'])

# Freeze the collection once it is complete.
restaurant_ids = frozenset(restaurant_ids)

pprint(f'{len(restaurant_ids):,} restaurants in the dataset.')

In [None]:
# Directory that holds the intermediate files written by this notebook.
scratch_directory = os.path.join(data_directory, 'scratch')

# makedirs(exist_ok=True) tolerates an already-existing directory, creates any
# missing parent directories, and still raises FileExistsError when the path
# exists but is not a directory (the old try/except silently accepted that).
os.makedirs(scratch_directory, exist_ok=True)

review_txt_filepath = os.path.join(scratch_directory, 'review_text_all.txt')

In [None]:
%%time
# Set to True to (re)build the review text file; otherwise the existing file
# is only counted.
execute = False

if execute:
    # Write the text of every review whose business_id is in restaurant_ids,
    # one review per line. Embedded newlines are escaped as a literal
    # backslash followed by 'n' so each review stays on a single line.
    review_count = 0
    with open(review_txt_filepath, 'w') as review_txt_file:
        with open(review_json_filepath) as review_json_file:
            for review_json in review_json_file:
                review = json.loads(review_json)
                if review['business_id'] not in restaurant_ids:
                    continue
                review_txt_file.write(review['text'].replace('\n', '\\n') + '\n')
                review_count += 1
    print(f'Text from {review_count:,} restaurant reviews written to the new txt file.')

else:
    # Count lines directly so an empty file reports 0 instead of raising
    # NameError (or silently reusing a stale review_count from an earlier run).
    with open(review_txt_filepath) as review_txt_file:
        review_count = sum(1 for _ in review_txt_file)

    print(f'Text from {review_count:,} restaurant reviews in the txt file.')

In [None]:
import spacy
from spacy import displacy
import pandas as pd
import itertools as it

In [None]:
!python -m spacy download en_core_web_md

In [None]:
nlp = spacy.load('en_core_web_md')

In [None]:
# Zero-based line number of the review to display.
review_num = 42

with open(review_txt_filepath) as f:
    # islice advances to the requested line and yields just that one.
    selected_lines = list(it.islice(f, review_num, review_num + 1))

# Turn the escaped two-character '\n' sequences back into real newlines.
sample_review = selected_lines[0].replace('\\n', '\n')

print(sample_review)

In [None]:
%%time
parsed_review = nlp(sample_review)

In [None]:
print(parsed_review)

In [None]:
displacy.render(parsed_review, style="ent", jupyter=True)

In [None]:
from gensim.models.phrases import Phrases, Phraser
from gensim.models.word2vec import LineSentence

In [None]:
def punct_space(token):
    """Return a truthy value when *token* is punctuation or whitespace.

    Reads the token's ``is_punct`` and ``is_space`` attributes; ``is_space``
    is only consulted when ``is_punct`` is falsy.
    """
    is_punctuation = token.is_punct
    return is_punctuation if is_punctuation else token.is_space

def pronoun_lemmatize(token):
    """Return the lowercased lemma of *token*.

    When the lemma is the placeholder string '-PRON-', the token's ``lower_``
    attribute is returned instead.

    NOTE(review): '-PRON-' is presumably the pronoun placeholder lemma of the
    spaCy release this was written for -- confirm the loaded model still
    emits it; otherwise the first branch is never taken.
    """
    uses_placeholder = token.lemma_ == '-PRON-'
    return token.lower_ if uses_placeholder else token.lemma_.lower()

def line_review(filename):
    """Yield each line of *filename* with escaped newlines restored.

    Every literal backslash-n pair in a line is replaced by a real newline
    character; the line's own trailing newline is left in place. The file is
    read lazily, one line at a time.
    """
    with open(filename) as review_file:
        yield from (line.replace('\\n', '\n') for line in review_file)

In [None]:
review_lemmatized_filepath = os.path.join(scratch_directory, 'review_lemmatized_all.txt')
sentence_lemmatized_filepath = os.path.join(scratch_directory, 'sentence_lemmatized_all.txt')

In [None]:
%%time
# Set to True to regenerate the lemmatized review and sentence files.
execute = False

if execute:
    # Parse every review once and write two outputs from the same parse: one
    # lemmatized line per review and one lemmatized line per sentence.
    with open(review_lemmatized_filepath, 'w') as review_file, \
            open(sentence_lemmatized_filepath, 'w') as sentence_file:

        for parsed_review in nlp.pipe(line_review(review_txt_filepath), batch_size=5000):
            # Tokens for which punct_space() is truthy are dropped.
            review_lemmas = (
                pronoun_lemmatize(token)
                for token in parsed_review
                if not punct_space(token)
            )
            review_file.write(' '.join(review_lemmas) + '\n')

            for sent in parsed_review.sents:
                sentence_lemmas = (
                    pronoun_lemmatize(token)
                    for token in sent
                    if not punct_space(token)
                )
                sentence_file.write(' '.join(sentence_lemmas) + '\n')

In [None]:
sentences_unigrams = LineSentence(sentence_lemmatized_filepath)

In [None]:
for sentence_unigrams in it.islice(sentences_unigrams, 60, 70):
    print(' '.join(sentence_unigrams))
    print('')

In [None]:
bigram_model_filepath = os.path.join(scratch_directory, 'bigram_phrase_model')

In [None]:
%%time
# Set to True to rebuild the bigram phrase model.
execute = False

if execute:
    # Build a Phrases model from the unigram sentences, wrap it in a Phraser,
    # and save the result to bigram_model_filepath.
    bigram_phrases = Phraser(Phrases(sentences_unigrams))
    bigram_phrases.save(bigram_model_filepath)

In [None]:
bigram_phrases = Phraser.load(bigram_model_filepath)

In [None]:
sentences_bigrams_filepath = os.path.join(scratch_directory, 'sentence_bigram_phrases_all.txt')

In [None]:
%%time
# Set to True to regenerate the bigram sentence file.
execute = False
if execute:
    # Apply the bigram model to each unigram sentence and write the result,
    # one space-joined sentence per line.
    with open(sentences_bigrams_filepath, 'w') as f:
        f.writelines(
            ' '.join(bigram_phrases[sentence_unigrams]) + '\n'
            for sentence_unigrams in sentences_unigrams
        )

In [None]:
sentences_bigrams = LineSentence(sentences_bigrams_filepath)

In [None]:
for sentence_bigrams in it.islice(sentences_bigrams, 60, 70):
    print(' '.join(sentence_bigrams))
    print('')

In [None]:
trigram_model_filepath = os.path.join(scratch_directory, 'trigram_phrase_model')

In [None]:
%%time
# Set to True to rebuild the trigram phrase model.
execute = False

if execute:
    # Build a Phrases model from the bigram sentences, wrap it in a Phraser,
    # and save the result to trigram_model_filepath.
    trigram_phrases = Phraser(Phrases(sentences_bigrams))
    trigram_phrases.save(trigram_model_filepath)

In [None]:
trigram_phrases = Phraser.load(trigram_model_filepath)

In [None]:
sentences_trigrams_filepath = os.path.join(scratch_directory, 'sentence_trigram_phrases_all.txt')

In [None]:
%%time
# Set to True to regenerate the trigram sentence file.
execute = False

if execute:
    # Apply the trigram model to each bigram sentence and write the result,
    # one space-joined sentence per line.
    with open(sentences_trigrams_filepath, 'w') as f:
        f.writelines(
            ' '.join(trigram_phrases[sentence_bigrams]) + '\n'
            for sentence_bigrams in sentences_bigrams
        )

In [None]:
sentences_trigrams = LineSentence(sentences_trigrams_filepath)

In [None]:
for sentence_trigrams in it.islice(sentences_trigrams, 60, 70):
    print(' '.join(sentence_trigrams))
    print('')

In [None]:
review_trigrams_filepath = os.path.join(scratch_directory, 'review_trigrams_all.txt')

In [None]:
%%time
# Set to True to regenerate the per-review trigram file.
execute = False

if execute:
    reviews_lemmatized = LineSentence(review_lemmatized_filepath)

    with open(review_trigrams_filepath, 'w') as f:
        for review_unigrams in reviews_lemmatized:
            # Chain the two phrase models: unigrams -> bigrams -> trigrams.
            phrased_terms = trigram_phrases[bigram_phrases[review_unigrams]]

            # Keep only the terms absent from nlp.Defaults.stop_words.
            kept_terms = (
                term
                for term in phrased_terms
                if term not in nlp.Defaults.stop_words
            )

            f.write(' '.join(kept_terms) + '\n')

In [None]:
review_num = 0

print('Original:' + '\n')

for review in it.islice(line_review(review_txt_filepath), review_num, review_num+1):
    print(review)

print('----' + '\n')
print('Transformed:' + '\n')

with open(review_trigrams_filepath) as f:
    for review in it.islice(f, review_num, review_num+1):
        print(review)

In [None]:
!pip install pyLDAvis==2.1.2

In [None]:
from gensim.corpora import Dictionary, MmCorpus
from gensim.models.ldamulticore import LdaMulticore

import pyLDAvis
import pyLDAvis.gensim
import warnings
import pickle

In [None]:
dictionary_filepath = os.path.join(scratch_directory, 'trigram_dict_all.dict')

In [None]:
%%time
execute = False

if execute:
    reviews_trigrams = LineSentence(review_trigrams_filepath)
    dictionary_trigrams = Dictionary(reviews_trigrams)
    dictionary_trigrams.filter_extremes(no_below=20, no_above=0.4)
    dictionary_trigrams.compactify()
    dictionary_trigrams.save(dictionary_filepath)  

In [None]:
dictionary_trigrams = Dictionary.load(dictionary_filepath)

In [None]:
bow_corpus_filepath = os.path.join(scratch_directory, 'bow_trigrams_corpus_all.mm')

In [None]:
def bow_generator(filepath):
    """Yield the ``doc2bow`` representation of each line of *filepath*.

    Lines are read through ``LineSentence`` and converted with the
    module-level ``dictionary_trigrams``; nothing is held in memory beyond
    the line currently being converted.
    """
    yield from (
        dictionary_trigrams.doc2bow(tokens)
        for tokens in LineSentence(filepath)
    )

In [None]:
%%time
execute = False

if execute:
    MmCorpus.serialize(bow_corpus_filepath, bow_generator(review_trigrams_filepath))

In [None]:
trigram_bow_corpus = MmCorpus(bow_corpus_filepath)

In [None]:
lda_model_filepath = os.path.join(scratch_directory, 'lda_model_all')

In [None]:
%%time
# Set to True to retrain the LDA model and overwrite the saved copy.
execute = False

if execute:
    # Size the worker pool from the machine actually running the notebook
    # instead of a fixed 7: leave one core for the main process and never
    # drop below one worker.
    lda_workers = max(1, (os.cpu_count() or 2) - 1)

    # Warnings raised during training are suppressed for this call only.
    with warnings.catch_warnings():
        warnings.simplefilter('ignore')
        lda = LdaMulticore(
            trigram_bow_corpus,
            num_topics=50,
            id2word=dictionary_trigrams,
            workers=lda_workers,
        )

    lda.save(lda_model_filepath)

In [None]:
lda = LdaMulticore.load(lda_model_filepath)

In [None]:
def explore_topic(topic_number, topn=25):
    """Print the top terms of one topic as a two-column table.

    Args:
        topic_number: Topic index, passed to ``lda.show_topic``.
        topn: Number of terms requested from ``lda.show_topic``.

    Uses the module-level ``lda`` model. The first column is the term padded
    to 20 characters; the second, headed "frequency", is the value paired
    with the term, rounded and shown with three decimals.
    """
    header = f'{"term":20} {"frequency"}'
    print(header + '\n')

    rows = lda.show_topic(topic_number, topn=topn)
    for term, frequency in rows:
        rounded = round(frequency, 3)
        print(f'{term:20} {rounded:.3f}')

In [None]:
explore_topic(topic_number=20, topn=5)

In [None]:
# Display names for the LDA topics, keyed by topic number (0-49); presumably
# assigned by hand after inspecting each topic's top terms.
# NOTE(review): the labels are not unique -- 'time' is used for topics 3 and
# 11, and 'miscellaneous' for topics 28 and 33 -- so they cannot serve as
# unique identifiers. 'orders1' (topic 10) also breaks the 'order2'/'order3'
# naming pattern; confirm whether that is a typo.
topic_names = {
    0: 'place1',
    1: 'sauce',
    2: 'place2',
    3: 'time',
    4: 'service',
    5: 'seafood1',
    6: 'reservation',
    7: 'taste',
    8: 'donut',
    9: 'vietnam',
    10: 'orders1',
    11: 'time',
    12: 'salad',
    13: 'order2',  # flagged by the author with a bare marker; reason not recorded
    14: 'burgers & fries',
    15: 'mexican',
    16: 'order3',
    17: 'seafood2',
    18: 'staff',
    19: 'atmosphere',
    20: 'chip',
    21: 'bar vibe',  # flagged by the author with a bare marker; reason not recorded
    22: 'meal experience',  # flagged by the author with a bare marker; reason not recorded
    23: 'slow service',
    24: 'brunch',
    25: 'portion sizes',
    26: 'beer, wings, sports',
    27: 'breakfast',
    28: 'miscellaneous',
    29: 'non-English',
    30: 'deli',
    31: 'barbecue',
    32: 'local business',
    33: 'miscellaneous',
    34: 'hole-in-the-wall',
    35: 'asian',
    36: 'specials',
    37: 'coffeeshop',
    38: 'prices',
    39: 'flavor & texture',
    40: 'noodles',
    41: 'canadian',
    42: 'highly recommended',
    43: 'sushi',
    44: 'ordering',
    45: 'mediterranean',
    46: 'decent value',
    47: 'cleanliness',
    48: 'lobster',
    49: 'seafood'
    }

In [None]:
topic_names_filepath = os.path.join(scratch_directory, 'topic_names.pkl')

with open(topic_names_filepath, 'wb') as f:
    pickle.dump(topic_names, f)

In [None]:
LDAvis_data_filepath = os.path.join(scratch_directory, 'ldavis_prepared')

**Por hacer**

* Añadir comentarios
* Incrustaciones de palabras con Word2vec
* Visualizaciones
* Álgebra de palabras

**Referencias**

* https://spacy.io/
* https://radimrehurek.com/gensim/
* https://github.com/pwharrison/modern-nlp-in-python-2019/blob/master/notebooks/Modern_NLP_in_Python.ipynb