# `spaCy`

`spaCy` es una librería de Python para proyectos de **Procesamiento del Lenguaje Natural** (NLP) que pone a disposición del programador varias técnicas utilizadas en un pipeline de NLP:
- Tokenization
- Normalización del texto (eliminar mayúsculas, stemming, lemmatization)
- Part-Of-Speech tagging
- Named Entity Recognition (NER)

Para instalar `spaCy` y dependencias:
```
pip install spacy

python -m spacy download es_core_news_sm
```

In [None]:
import itertools as it
import pandas as pd
import spacy

In [None]:
# Load the 'es_core_news_sm' spaCy pipeline; it is reused further down to
# tokenize and lemmatize every post.
nlp = spacy.load('es_core_news_sm')

# Load data

In [None]:
import pandas as pd
import random

In [None]:
# Load the preprocessed posts from a pickle file.
# NOTE(review): the path is absolute and machine-specific, so this cell only
# runs where that exact directory exists; consider a repository-relative path.
# NOTE(review): unpickling can execute arbitrary code; only load pickle files
# that come from a trusted source.
df_clean = pd.read_pickle('/Users/eduardomorenoortiz/Desktop/ITAM/nanook/nlp_nanook/data/preprocessed/stemm_lemm_text_nanook.pkl')

In [None]:
# Preview the first rows of the page name and of the message columns
# (judging by the column names: original, cleaned, stemmed and lemmatized text).
df_clean[['Page Name', 'Message', 'Message_clean', 'Message_clean_stemm', 'Message_clean_lemm']].head()

In [None]:
# Dimensions of the loaded data.
df_clean.shape

In [None]:
# Print the first five posts of each text column so the output of every
# preprocessing stage can be compared. Each section is a header line, up to
# five numbered posts, and a blank separator.
for header, column in (
    ('ORIGINAL MESSAGE', 'Message'),
    ('CLEAN MESSAGE', 'Message_clean'),
    ('LEMMED MESSAGE', 'Message_clean_lemm'),
    ('STEMMED MESSAGE', 'Message_clean_stemm'),
):
    print(header)
    # head(5) caps the preview at five rows and copes with shorter frames.
    for post_number, text in enumerate(df_clean[column].head(5), start=1):
        print(f"Post {post_number}: {text}")
    print("\n")

In [None]:
# Plain Python list with the values of the 'Message_clean' column; this is
# the input of the spaCy processing loop below.
facebook_post_clean = df_clean['Message_clean'].values.tolist()
# Show the first five entries as a quick sanity check.
facebook_post_clean[:5]

# Preprocess data: tokenize and lemmatize text

In [None]:
from tqdm import tqdm

In [None]:
%time
tokens_spacy = []
for post in tqdm(facebook_post_clean[:4000]):
    doc = nlp(post)
    token_i = [token.lemma_ for token in doc if not token.is_stop]
    tokens_spacy.append(token_i)

print(tokens_spacy[:1])

In [None]:
# Number of processed posts (one lemma list per post).
len(tokens_spacy)

# N-grams

Es necesaria la versión `1.10.1` de `scipy`:

```
pip install scipy==1.10.1
```

In [None]:
import gensim

In [None]:
# Train the phrase-detection models: `bigram` is fitted on the lemma lists,
# `trigram` on those same lists after the bigram model has been applied.
# `trigram` does not pass min_count, so it uses the library default.
bigram = gensim.models.Phrases(tokens_spacy, min_count=5, threshold=100)
trigram = gensim.models.Phrases(bigram[tokens_spacy], threshold=100)

In [None]:
# Wrap the trained phrase models in Phraser objects; these are what the
# cells below index with a token list to obtain the merged phrases.
bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)

In [None]:
# Apply the bigram model and then the trigram model to the first post.
print(trigram_mod[bigram_mod[tokens_spacy[0]]])

In [None]:
# Same post with only the bigram model applied, for comparison.
bigram_mod[tokens_spacy[0]]

In [None]:
def make_bigrams(texts):
    """Apply the module-level ``bigram_mod`` to every document in *texts*.

    Args:
        texts: Iterable of documents; each one is passed as-is to
            ``bigram_mod[...]``.

    Returns:
        A list with one ``bigram_mod[document]`` result per input document,
        in the original order.
    """
    return [bigram_mod[tokens] for tokens in texts]

In [None]:
# Bigram-transformed version of every processed post.
data_words_bigrams = make_bigrams(tokens_spacy)

# Corpus

In [None]:
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

In [None]:
# Dictionary that maps every distinct token in the bigram documents to an
# integer id.
id2word = corpora.Dictionary(data_words_bigrams)

# The documents that make up the corpus.
texts = data_words_bigrams

# Bag-of-words form of each document, as produced by Dictionary.doc2bow.
corpus = list(map(id2word.doc2bow, texts))

# Inspect the first document of the corpus.
print(corpus[:1])

In [None]:
# Token stored under id 0 in the dictionary.
id2word[0]

In [None]:
# Human-readable view of the first document: each (token id, frequency) pair
# is shown as (token, frequency). The loop variable is named `token_id`
# rather than `id` so that it does not shadow the builtin.
[[(id2word[token_id], freq) for token_id, freq in doc_bow] for doc_bow in corpus[:1]]

# LDA

In [None]:
from pprint import pprint

In [None]:
# Baseline LDA model with 10 topics, trained on the bag-of-words corpus.
lda_model = gensim.models.ldamodel.LdaModel(
    corpus=corpus,
    id2word=id2word,
    num_topics=10,
    random_state=100,  # fixed seed so reruns are comparable
    update_every=1,
    chunksize=100,
    passes=10,
    alpha='auto',
    per_word_topics=True,
)

In [None]:
# Pretty-print the topics reported by the trained model.
pprint(lda_model.print_topics())
# Apply the model to the whole corpus and keep the result for later use.
doc_lda = lda_model[corpus]

# Model evaluation

In [None]:
# Per-word likelihood bound and perplexity.
# LdaModel.log_perplexity() returns the per-word likelihood bound, not the
# perplexity itself: for the bound, higher is better. Perplexity is
# 2 ** (-bound), and for perplexity lower is better.
per_word_bound = lda_model.log_perplexity(corpus)
print('\nPer-word likelihood bound: ', per_word_bound)
print('Perplexity: ', 2 ** (-per_word_bound))

# Coherence score (c_v) of the model over the bigram documents.
coherence_model_lda = CoherenceModel(
    model=lda_model,
    texts=data_words_bigrams,
    dictionary=id2word,
    coherence='c_v',
)
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)

# Optimum number of topics

In [None]:
import matplotlib.pyplot as plt

In [None]:
%time
limit=40
start=2
step=6
coherence_values = []
model_list = []

for num_topics in tqdm(range(start, limit, step)):
    lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                                id2word=id2word,
                                                num_topics=num_topics, 
                                                random_state=100,
                                                update_every=1,
                                                chunksize=100,
                                                passes=10,
                                                alpha='auto',
                                                per_word_topics=True)
    model_list.append(lda_model)
    coherencemodel = CoherenceModel(model=lda_model, texts=data_words_bigrams, dictionary=id2word, coherence='c_v')
    coherence_values.append(coherencemodel.get_coherence())

In [None]:
# Plot the coherence score against the number of topics tried in the sweep.
x = range(start, limit, step)
# Label the line directly: the previous `plt.legend(("coherence_values"))`
# passed a plain string (the parentheses do not make a tuple), so the legend
# treated it as a sequence of one-character labels.
plt.plot(x, coherence_values, label="coherence_values")
plt.xlabel("Num Topics")
plt.ylabel("Coherence score")
plt.legend(loc='best')
plt.show()

In [None]:
# Print each candidate number of topics next to its coherence score.
for n_top, coh_val in zip(x, coherence_values):
    print(f"{n_top}: {coh_val}")

In [None]:
# Retrain the LDA model with 14 topics.
# NOTE(review): 14 was presumably picked from the coherence sweep above;
# confirm against the printed scores.
lda_model = gensim.models.ldamodel.LdaModel(
    corpus=corpus,
    id2word=id2word,
    num_topics=14,
    random_state=100,  # fixed seed so reruns are comparable
    update_every=1,
    chunksize=100,
    passes=10,
    alpha='auto',
    per_word_topics=True,
)
# Pretty-print the topics reported by the retrained model.
pprint(lda_model.print_topics())
# Apply the retrained model to the whole corpus.
doc_lda = lda_model[corpus]