# Cargar datos

In [None]:
import pandas as pd
import random

In [None]:
# Load the preprocessed posts DataFrame from its pickle file.
# NOTE(review): unpickling can execute arbitrary code; only load files you trust.
df_clean = pd.read_pickle('../../data/preprocessed/stemm_lemm_text_nanook.pkl')

In [None]:
# Preview the page name and the original message next to its processed text columns.
df_clean[['Page Name', 'Message', 'Message_clean', 'Message_clean_stemm', 'Message_clean_lemm']].head()

In [None]:
# (rows, columns) of the loaded DataFrame.
df_clean.shape

In [None]:
# Number of distinct values in the 'Facebook Id' column.
len(set(df_clean['Facebook Id']))

El set de datos consta de $87,422$ posts de Facebook realizados por $20,415$ usuarios.

Dado que se han procesado los posts, a continuación se muestran los posts:
- Originales
- Limpios (sin *stop words*)
- *Lematizados*
- *Stemmizados*

In [None]:
# Print the first five entries of each text column under its own heading,
# followed by a blank separator.
for heading, column in (
    ('ORIGINAL MESSAGE', 'Message'),
    ('CLEAN MESSAGE', 'Message_clean'),
    ('LEMMED MESSAGE', 'Message_clean_lemm'),
    ('STEMMED MESSAGE', 'Message_clean_stemm'),
):
    print(heading)
    for number, text in enumerate(df_clean[column].head(5), start=1):
        print(f"Post {number}: {text}")
    print("\n")

Guardar todos los posts limpios en un archivo de texto plano (`clean_posts.txt`) para procesarlos posteriormente con `spaCy`.

In [None]:
# Plain-text file of cleaned posts: written by the export cell and read back by the spaCy cells.
CLEAN_FACEBOOK_POSTS_PATH_FILE = "../../data/preprocessed/clean_posts.txt"

In [None]:
# The export is disabled by default; set the flag to True to regenerate the file.
WRITE_CLEAN_POSTS = False
if WRITE_CLEAN_POSTS:
    posts = df_clean['Message_clean'].values.tolist()
    # Explicit UTF-8 so the file matches the encoding used when it is read back.
    with open(CLEAN_FACEBOOK_POSTS_PATH_FILE, "w", encoding="utf-8") as f:
        for post in posts:
            # Every post is followed by an empty line.
            f.write(post + "\n\n")
    # Report the real number of posts written. The previous message used the
    # last loop index (count - 1) and failed with NameError on an empty frame.
    print('Se escribieron {:,} posts de facebook en el archivo de texto plano {:}.'.format(len(posts), CLEAN_FACEBOOK_POSTS_PATH_FILE.split('/')[-1]))

# `spaCy`

`spaCy` es una librería de Python utilizada para proyectos de **Procesamiento del Lenguaje Natural** (NLP) poniendo a la disposición del programador varias técnicas utilizadas en un pipeline de NLP:
- Tokenization
- Normalización del texto (eliminar mayúsculas, stemming, lemmatization)
- Part-Of-Speech tagging
- Named Entity Recognition (NER)

Para instalar `spaCy` y dependencias:
```
pip install spacy

python -m spacy download es_core_news_sm

python -m spacy download es_core_news_lg
```

In [None]:
import codecs
import itertools as it
import spacy

In [None]:
# Load the Spanish pipeline `es_core_news_lg`; the `es_core_news_sm` line is
# left commented out as an alternative.
#nlp = spacy.load('es_core_news_sm')
nlp = spacy.load('es_core_news_lg')

Mostremos un post de Facebook.

In [None]:
# Read only the fifth line of the posts file, then turn any literal "\n"
# escape sequences in it back into real line breaks.
with codecs.open(CLEAN_FACEBOOK_POSTS_PATH_FILE, encoding='utf_8') as posts_file:
    fifth_line = list(it.islice(posts_file, 4, 5))
sample_review = fifth_line[0].replace('\\n', '\n')

print(sample_review)

Ahora trabajemos con `spaCy`

In [None]:
# Run the loaded spaCy pipeline on the sample post.
parsed_review = nlp(sample_review)

In [None]:
# Printing the parsed object shows its text.
print(parsed_review)

Luce idéntico a la cadena sin utilizar `spaCy`, ¿cuál es la diferencia?

Gracias a `spaCy` es fácil reproducir el preprocesamiento (realizado con anterioridad) y, además, disponer de más herramientas útiles en **NLP**:
- Part-of-Speech (POS): adjetivos, sustantivos, pronombres, ...
- Lemmatization
- Stop words
- Named Entity Recognition (NER): personas, organizaciones, lugares, ...

In [None]:
# Text and part-of-speech tag of every token in the parsed sample.
# `token_text` is reused by the tables in later cells.
token_text = [tok.orth_ for tok in parsed_review]
token_pos = [tok.pos_ for tok in parsed_review]

pd.DataFrame({'token_text': token_text,
              'part_of_speech': token_pos})

In [None]:
# List every entity found in the sample together with its label.
for position, entity in enumerate(parsed_review.ents, start=1):
    print(f'Entity {position}:', entity, '-', entity.label_)
    print('')

In [None]:
# Lemma and shape of each token, shown next to the token text collected
# in the part-of-speech cell.
token_lemma = [tok.lemma_ for tok in parsed_review]
token_shape = [tok.shape_ for tok in parsed_review]

pd.DataFrame({'token_text': token_text,
              'token_lemma': token_lemma,
              'token_shape': token_shape})

In [None]:
# Print the sentences spaCy detected in the sample, numbered from 1.
for position, sentence in enumerate(parsed_review.sents, start=1):
    print(f'Sentence {position}:')
    print(sentence)
    print('')

In [None]:
# Entity type and inside/outside/begin marker of each token, shown next to
# the token text collected in the part-of-speech cell.
token_entity_type = [tok.ent_type_ for tok in parsed_review]
token_entity_iob = [tok.ent_iob_ for tok in parsed_review]

pd.DataFrame({'token_text': token_text,
              'entity_type': token_entity_type,
              'inside_outside_begin': token_entity_iob})

¡Una de las ventajas de trabajar con `spaCy` es que podemos preprocesar los datos **más rápido**! Nos pudimos ahorrar el trabajo de procesamiento de texto utilizando los modelos precargados de `spaCy`.

In [None]:
# Collect, for every token of the parsed sample, its text and four boolean flags.
flag_columns = ['stop?', 'punctuation?', 'whitespace?', 'number?']

token_attributes = [(token.orth_,
                     token.is_stop,
                     token.is_punct,
                     token.is_space,
                     token.like_num)
                    for token in parsed_review]

df = pd.DataFrame(token_attributes, columns=['text'] + flag_columns)

# Show 'Yes' / '' instead of True / False. Each flag column is replaced as a
# whole with `Series.map`: this avoids writing strings into boolean columns
# through `.loc` (deprecated incompatible-dtype assignment) and does not depend
# on `DataFrame.map`, which only exists in recent pandas versions.
for flag_column in flag_columns:
    df[flag_column] = df[flag_column].map(lambda flag: 'Yes' if flag else '')
df

# Phrase modelling

In [None]:
import pandas as pd
#from gensim.models import Phrases
from gensim.models.phrases import Phrases, Phraser
from gensim.models.word2vec import LineSentence

In [None]:
def punct_space(token):
    """
    Decide whether a token should be discarded.

    Returns a truthy value when the token is punctuation, whitespace
    or a stop word; otherwise returns the (falsy) stop-word flag.
    """
    if token.is_punct:
        return True
    if token.is_space:
        return True
    return token.is_stop

def line_review(filename):
    """
    Lazily yield the lines of a UTF-8 text file, one at a time,
    converting each literal backslash-n sequence into a real line break.
    """
    with codecs.open(filename, encoding='utf_8') as reviews_file:
        yield from (raw_line.replace('\\n', '\n') for raw_line in reviews_file)
            
def lemmatized_sentence_corpus(filename):
    """
    Stream the file through the spaCy pipeline and yield one string per
    detected sentence: the space-joined lemmas of the tokens that
    `punct_space` does not discard.
    """
    parsed_reviews = nlp.pipe(line_review(filename), batch_size=1000, n_process=1)
    for parsed_review in parsed_reviews:
        for sentence in parsed_review.sents:
            kept_lemmas = [token.lemma_ for token in sentence if not punct_space(token)]
            yield ' '.join(kept_lemmas)

In [None]:
# Text file with one lemmatized sentence per line (written by the guarded cell below).
UNIGRAM_SENT_PATH = '../../data/preprocessed/unigram_sent.txt'

In [None]:
%%time
# this is a bit time consuming - make the if statement True
# if you want to execute data prep yourself.
if 0 == 1:
    with codecs.open(UNIGRAM_SENT_PATH, 'w', encoding='utf_8') as f:
        for sentence in lemmatized_sentence_corpus(CLEAN_FACEBOOK_POSTS_PATH_FILE):
            f.write(sentence + '\n')

In [None]:
# Re-iterable stream over the unigram sentence file (one sentence per line).
unigram_sentences = LineSentence(UNIGRAM_SENT_PATH)

In [None]:
# Load the frame that includes the stop-word-filtered lemmatized column.
# A project-relative path (same data folder as the other inputs) replaces the
# user-specific absolute path so the notebook runs on any checkout.
# NOTE(review): this rebinds `df_clean`, replacing the frame loaded at the top.
df_clean = pd.read_pickle('../../data/preprocessed/stemm_lemm_stop_words.pkl')

In [None]:
# Tokenize every message of the 'Message_clean_lemm_stpWrd' column by splitting on whitespace.
sent = [row.split() for row in df_clean['Message_clean_lemm_stpWrd']]

In [None]:
# Show the token lists of the first two messages.
for tokens in sent[:2]:
    print(tokens)

In [None]:
# Number of tokenized messages.
len(sent)

In [None]:
# Train a phrase (collocation) model on the tokenized messages.
# NOTE(review): `phrases` is trained again with different parameters in a
# later cell before anything reads this instance.
phrases = Phrases(sent, min_count=30, threshold=30, progress_per=10000)

Observemos algunos textos lematizados:

In [None]:
# Print the first five sentences of the unigram corpus, one blank line apart.
for tokens in it.islice(unigram_sentences, 5):
    print(' '.join(tokens))
    print('')

In [None]:
# Where the trained bigram model is saved and loaded from.
BIGRAM_MODEL_PATH = "../models/bigram_model"

In [None]:
# Spanish function words passed to gensim's `connector_words` option.
SPANISH_CONNECTOR_WORDS = frozenset([
    # prepositions
    'a', 'al', 'ante', 'bajo', 'cabe', 'con', 'contra', 'de', 'desde',
    'durante', 'en', 'entre', 'hacia', 'hasta', 'para', 'por', 'según',
    'sin', 'sobre', 'tras', 'versus', 'vía',
    # conjunctions
    'o', 'u', 'y', 'ni', 'pero', 'mas', 'sino', 'aunque', 'si', 'pues',
    'porque', 'que',
    # relatives / interrogatives
    'donde', 'como', 'cuanto', 'cuales', 'quien', 'quienes', 'cual',
    'cuál', 'cuáles', 'cuan',
])

In [None]:
%%time
if 1 == 1:
    #bigram_model = Phrases(UNIGRAM_SENT_PATH, 
    #                        #min_count=1, 
    #                        #threshold=1, 
    #                        #connector_words=SPANISH_CONNECTOR_WORDS
    #                        )
    #bigram_model = Phrases(sent, min_count=30, threshold=30, progress_per=10000)
    #bigram_model = Phraser(bigram_model)
    #phrases = Phrases(sent, min_count=500, threshold=10)
    phrases = Phrases(sent, min_count=50, threshold=10, connector_words=SPANISH_CONNECTOR_WORDS)
    bigram_model = Phraser(phrases)
    bigram_model.save(BIGRAM_MODEL_PATH)
bigram_model = Phrases.load(BIGRAM_MODEL_PATH)

In [None]:
# Text file that receives the bigram-merged sentences, one per line.
BIGRAM_SENT_PATH = "../../data/preprocessed/bigram_sents.txt"

In [None]:
%%time
if 1 == 1:
    with codecs.open(BIGRAM_SENT_PATH, 'w', encoding='utf_8') as f:
        for unigram_sentence in unigram_sentences:
            bigram_sentence = ' '.join(bigram_model[unigram_sentence])
            f.write(bigram_sentence + '\n')

In [None]:
# Re-iterable stream over the bigram sentence file (one sentence per line).
bigram_sentences = LineSentence(BIGRAM_SENT_PATH)

Ahora observemos algunos bigramas

In [None]:
# Print the first ten sentences of the bigram corpus, one blank line apart.
for tokens in it.islice(bigram_sentences, 10):
    print(' '.join(tokens))
    print('')

In [None]:
from gensim.test.utils import datapath
from gensim.models.word2vec import Text8Corpus
from gensim.models.phrases import Phrases, ENGLISH_CONNECTOR_WORDS

In [None]:
# Spanish function words passed to gensim's `connector_words` option.
# This rebinds the constant defined earlier in the notebook with a larger set.
# The literal previously repeated many entries; each distinct member is now
# listed exactly once, so the resulting set is unchanged.
SPANISH_CONNECTOR_WORDS = frozenset({
    # prepositions
    'a', 'al', 'ante', 'bajo', 'cabe', 'con', 'contra', 'de', 'desde',
    'durante', 'en', 'entre', 'hacia', 'hasta', 'para', 'por', 'según',
    'sin', 'sobre', 'tras', 'versus', 'vía',
    # conjunctions
    'o', 'u', 'y', 'ni', 'pero', 'mas', 'sino', 'aunque', 'si', 'pues',
    'porque', 'que',
    # relatives / interrogatives
    'donde', 'como', 'cuanto', 'cuanta', 'cuantos', 'cuantas', 'cuales',
    'quien', 'quienes', 'cual', 'cuál', 'cuáles', 'cuan',
    # NOTE(review): the entries below do not look like Spanish words; they are
    # kept only to preserve the original set — confirm whether they can go.
    'cuanlo', 'cuanlos', 'cuanla', 'cuanlas', 'cuanle', 'cuanles',
    'cuanse', 'cuanme', 'cuanmen', 'cuanos', 'cuanas',
})

In [None]:
# Create the training corpus: an iterable of sentences, each a list of tokens.
# The unigram file stores one sentence per line, so LineSentence is used to
# keep those boundaries (Text8Corpus reads the file as one continuous stream
# of words). The shared UNIGRAM_SENT_PATH constant replaces a user-specific
# absolute path.
sentences = LineSentence(UNIGRAM_SENT_PATH)

In [None]:
# Each sentence must be a list of string tokens: take the first one from the
# corpus and show its first ten tokens as a sanity check.
first_sentence = next(iter(sentences))
print(first_sentence[:10])

In [None]:
# Train a toy phrase model on our training corpus.
# min_count=1 and threshold=1 are the values used here; the Spanish connector
# words defined above are passed through `connector_words`.
phrase_model = Phrases(sentences, min_count=1, threshold=1, connector_words=SPANISH_CONNECTOR_WORDS)

In [None]:
# Apply the trained phrases model to a new, unseen sentence.
new_sentence = ['cambio', 'climatico', 'malo']
phrase_model[new_sentence]
# The result is the same list of tokens, except that adjacent tokens the model
# scored as a phrase are joined into a single token with the model's delimiter
# (presumably `_`, gensim's default — no delimiter is set in this notebook).


In [None]:
from gensim.models.phrases import Phrases, Phraser

In [None]:
# Load the frame that includes the stop-word-filtered lemmatized column.
# A project-relative path (same data folder as the other inputs) replaces the
# user-specific absolute path so the notebook runs on any checkout.
df_clean = pd.read_pickle('../../data/preprocessed/stemm_lemm_stop_words.pkl')

In [None]:
# First ten entries of the 'Message_clean_lemm_stpWrd' column.
df_clean['Message_clean_lemm_stpWrd'][:10]

In [None]:
# Tokenize every message of the 'Message_clean_lemm_stpWrd' column by splitting on whitespace.
sent = [row.split() for row in df_clean['Message_clean_lemm_stpWrd']]

In [None]:
# Number of tokenized messages.
len(sent)

In [None]:
# Train a phrase model on the tokenized messages (min_count=500, threshold=10)
# and wrap it in a Phraser for applying it to sentences.
phrases = Phrases(sent, min_count=500, threshold=10)
bigram = Phraser(phrases)

In [None]:
# NOTE(review): redundant — `bigram` is already built from `phrases` in the
# previous cell; this cell rebuilds the same object.
bigram = Phraser(phrases)

In [None]:
# Apply the bigram model to the first tokenized message.
bigram[sent[0]]

In [None]:
# Apply the bigram model to the whole tokenized corpus.
# NOTE(review): this rebinds `sentences`, which an earlier cell bound to the
# unigram training corpus; later cells see the bigram-transformed corpus.
sentences = bigram[sent]

In [None]:
from collections import defaultdict

In [None]:
# Count how often each token occurs across the bigram-transformed corpus,
# then show the vocabulary size.
word_freq = defaultdict(int)
for tokens in sentences:
    for token in tokens:
        word_freq[token] += 1
len(word_freq)

In [None]:
# Ten most frequent tokens, most frequent first.
sorted(word_freq, key=word_freq.get, reverse=True)[:10]

In [None]:
# Apply the trained model to each sentence of a corpus, using the same [] syntax.
# The loop uses its own variable name: `sent` holds the tokenized corpus built
# in an earlier cell and must not be overwritten by the last sentence here.
for phrased_sentence in phrase_model[sentences]:
    pass


In [None]:
from tqdm import tqdm

In [None]:
%time
tokens_spacy = []
for post in tqdm(facebook_post_clean[:4000]):
    doc = nlp(post)
    token_i = [token.lemma_ for token in doc if not token.is_stop]
    tokens_spacy.append(token_i)

print(tokens_spacy[:1])

In [None]:
# Number of posts that were tokenized.
len(tokens_spacy)

# N-grams

Es necesaria la versión `1.10.1` de `scipy`:

```
pip install scipy==1.10.1
```

In [None]:
import gensim

In [None]:
# Train a bigram phrase model on the spaCy tokens, then a second phrase model
# on the bigram-transformed tokens (named `trigram` here).
# NOTE(review): `bigram` was previously bound to a Phraser; it is rebound here.
bigram = gensim.models.Phrases(tokens_spacy, min_count=5, threshold=100)
trigram = gensim.models.Phrases(bigram[tokens_spacy], threshold=100)  

In [None]:
# Wrap both phrase models in Phraser objects for applying them to documents.
bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)

In [None]:
# First document after applying the bigram model and then the trigram model.
print(trigram_mod[bigram_mod[tokens_spacy[0]]])

In [None]:
# First document after applying only the bigram model.
bigram_mod[tokens_spacy[0]]

In [None]:
def make_bigrams(texts):
    """Apply the module-level `bigram_mod` to each document and return the results as a list."""
    merged_docs = []
    for document in texts:
        merged_docs.append(bigram_mod[document])
    return merged_docs

In [None]:
# Bigram-merged version of every tokenized post.
data_words_bigrams = make_bigrams(tokens_spacy)

# Corpus

In [None]:
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

In [None]:
# Vocabulary built from the bigram-merged documents.
id2word = corpora.Dictionary(data_words_bigrams)

# The documents used from here on are the bigram-merged token lists.
texts = data_words_bigrams

# Bag-of-words representation of every document.
corpus = list(map(id2word.doc2bow, texts))

# Show the bag of words of the first document.
print(corpus[:1])

In [None]:
# Token stored under id 0 in the dictionary.
id2word[0]

In [None]:
# First document of the corpus with ids replaced by their tokens: (token, frequency) pairs.
[[(id2word[token_id], count) for token_id, count in bow] for bow in corpus[:1]]

# LDA

In [None]:
from pprint import pprint

In [None]:
# Train an LDA topic model with 10 topics on the bag-of-words corpus.
# `random_state` is fixed so repeated runs use the same seed; the remaining
# arguments (update_every, chunksize, passes, alpha, per_word_topics) are
# gensim training options — see the LdaModel documentation for their meaning.
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=id2word,
                                           num_topics=10, 
                                           random_state=100,
                                           update_every=1,
                                           chunksize=100,
                                           passes=10,
                                           alpha='auto',
                                           per_word_topics=True)

In [None]:
# Pretty-print the topics of the trained model, then apply the model to the corpus.
pprint(lda_model.print_topics())
doc_lda = lda_model[corpus]

# Model development

In [None]:
# Compute Perplexity
# NOTE: `log_perplexity` returns gensim's per-word likelihood bound, not the
# perplexity itself: for this value higher (closer to zero) is better, while
# the derived perplexity 2 ** (-bound) is the quantity where lower is better.
print('\nPerplexity: ', lda_model.log_perplexity(corpus))

# Compute Coherence Score ('c_v' measure) on the bigram-merged documents.
coherence_model_lda = CoherenceModel(model=lda_model, texts=data_words_bigrams, dictionary=id2word, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)

# Optimum number of topics

In [None]:
import matplotlib.pyplot as plt

In [None]:
%time
limit=40
start=2
step=6
coherence_values = []
model_list = []

for num_topics in tqdm(range(start, limit, step)):
    lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                                id2word=id2word,
                                                num_topics=num_topics, 
                                                random_state=100,
                                                update_every=1,
                                                chunksize=100,
                                                passes=10,
                                                alpha='auto',
                                                per_word_topics=True)
    model_list.append(lda_model)
    coherencemodel = CoherenceModel(model=lda_model, texts=data_words_bigrams, dictionary=id2word, coherence='c_v')
    coherence_values.append(coherencemodel.get_coherence())

In [None]:
# Plot the coherence score obtained for each candidate number of topics.
x = range(start, limit, step)
plt.plot(x, coherence_values)
plt.xlabel("Num Topics")
plt.ylabel("Coherence score")
# The labels must be a sequence of strings: a one-element tuple needs the
# trailing comma, otherwise the bare string is iterated character by character
# and the legend entry reads "c".
plt.legend(("coherence_values",), loc='best')
plt.show()

In [None]:
# One line per candidate: number of topics and its coherence score.
for topic_count, score in zip(x, coherence_values):
    print(f"{topic_count}: {score}")

In [None]:
# Train the final LDA model with 14 topics, using the same settings as before,
# then print its topics and apply it to the corpus.
# NOTE(review): 14 is hard-coded; it is one of the values covered by the sweep
# (range(2, 40, 6)), presumably picked from the coherence results — confirm.
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=id2word,
                                           num_topics=14, 
                                           random_state=100,
                                           update_every=1,
                                           chunksize=100,
                                           passes=10,
                                           alpha='auto',
                                           per_word_topics=True)
pprint(lda_model.print_topics())
doc_lda = lda_model[corpus]