In [None]:
import pandas as pd
import nltk
import numpy as np
import seaborn as sns
from matplotlib.ticker import FuncFormatter
from nltk.corpus import cess_esp
from collections import Counter
from nltk.stem import SnowballStemmer
from nltk.stem import WordNetLemmatizer
import matplotlib.pyplot as plt

In [None]:
# One-time setup: uncomment the lines below to fetch the NLTK resources this notebook needs.
#nltk.download('omw')  # Download the WordNet resource in Spanish
#nltk.download('cess_esp')  # Download the Spanish CESS corpus (POS-tagged)
#nltk.download('punkt')  # Punkt tokenizer models
#nltk.download('wordnet')  # WordNet corpus

In [None]:
import sys
# Put the project's `src` directory first on the import path so `utils.utils` resolves.
# NOTE(review): this is a machine-specific absolute path; the import below fails on any
# other machine until the path is adjusted.
sys.path.insert(0, '/Users/eduardomorenoortiz/Desktop/ITAM/nanook/nlp_nanook/src') # LOCAL
from utils.utils import get_corpus_N_gram

### Manual configuration

If `nltk.download('omw')`, `nltk.download('cess_esp')`, `nltk.download('punkt')` and/or `nltk.download('wordnet')` returned **False**, download the following packages manually from this [link](http://www.nltk.org/nltk_data/):
- Model: *Extended Open Multilingual WordNet*, ID: *extended_omw*
- Model: *CESS-ESP Treebank*, ID: *cess_esp*
- Model: *Punkt Tokenizer Models*, ID: *punkt*
- Model: *WordNet*, ID: *wordnet*

Once files are downloaded move them to:
- *extended_omw*, *cess_esp*: */env_nanook/lib/nltk_data/corpora*
- *punkt*: */env_nanook/lib/nltk_data/tokenizers*
- *wordnet*: */env_nanook/lib/nltk_data/corpora*

In [None]:
# Register an additional directory in which NLTK looks up its data files.
# NOTE(review): machine-specific absolute path; adjust or remove it on other machines.
nltk.data.path.append('/Users/eduardomorenoortiz/Desktop/ITAM/nanook/nlp_nanook/env_nlp_nanook/lib/python3.11/site-packages/nltk/nltk_data') # Uncomment if necessary

# Load data

In [None]:
# Prefer the cached frame that already carries the stemmed/lemmatized columns; on a
# first run, when that cache does not exist yet, fall back to the plain cleaned text.
# This replaces commenting/uncommenting the two loads by hand.
# NOTE: loading a pickle can execute arbitrary code — only read files you trust.
try:
    df_clean = pd.read_pickle('../../data/preprocessed/stemm_lemm_text_nanook.pkl')
except FileNotFoundError:
    df_clean = pd.read_pickle('../../data/preprocessed/clean_text_nanook.pkl')

In [None]:
# Preview the first rows of the loaded frame.
df_clean.head()

In [None]:
# (rows, columns) of the loaded frame.
df_clean.shape

# Stemming

**Stemming** is a text normalization process used in *Natural Language Processing* (*NLP*) to reduce words to their base or root form, known as the **stem**. The goal of stemming is to obtain a common representation for variations of words that share the same root, even if that root may not be an actual word. The resulting stems may not always be semantically valid words but serve the purpose of grouping similar words together.

Stemming involves removing prefixes or suffixes from words to derive their root forms. The idea is to simplify words to their basic linguistic or morphological components. This process is particularly useful in tasks such as text analysis, information retrieval, and search engine optimization.

Key points about stemming:

1. Reduction of Inflected Words: Stemming reduces words to their base or root forms, removing variations caused by different tenses, pluralization, or other grammatical forms. For example, the stem of "running" is "run," and the stem of "happily" is "happi."

1. Heuristic-Based Approach: Stemming algorithms typically use heuristic rules to apply transformations to words. These rules are designed to strip common prefixes or suffixes, but they may not always result in a linguistically valid word.

1. Simplification of Vocabulary: Stemming helps simplify the vocabulary by treating different inflections of a word as the same entity. This can reduce the dimensionality of the data and improve the efficiency of text analysis.

1. Fast and Lightweight: Stemming is computationally less intensive than lemmatization, making it faster and more suitable for tasks where speed is crucial.

Here's a simple example in English:

- Word: "Running"
- Stem: "Run"

It's important to note that stemming does not consider the context or semantics of words. Different stemming algorithms may produce different results, and there might be cases where stemming produces stems that are not valid words or may not accurately reflect the intended meaning. Popular stemming algorithms include the Porter Stemmer and the Lancaster Stemmer.

Reference:
- Prompt: What is stemming? - [ChatGPT](https://chat.openai.com/)

In [None]:
# Snowball stemmer configured for Spanish; `realizar_stemming` reads this module-level object.
stemmer = SnowballStemmer("spanish")

In [None]:
def realizar_stemming(texto):
    """Return `texto` with every token replaced by its stem.

    The text is split with NLTK's word tokenizer (Spanish setting), each token
    is passed through the module-level `stemmer`, and the resulting stems are
    joined back together with single spaces.
    """
    tokens = nltk.word_tokenize(texto, language='spanish')
    return ' '.join(map(stemmer.stem, tokens))

In [None]:
# Show the stemmer's effect on two sample messages (index labels 0 and 23).
mensajes = df_clean['Message_clean']

muestra = mensajes[0]
print(f"Original text: {muestra}")
print(f"Stemmed text: {realizar_stemming(texto=muestra)}")
print('-' * 30)
muestra = mensajes[23]
print(f"Original text: {muestra}")
print(f"Stemmed  text: {realizar_stemming(texto=muestra)}")

In [None]:
# Compute the stemmed column only when the loaded frame does not already carry it
# (for example when starting from the plain cleaned text). This replaces manually
# uncommenting the cell on a first run and keeps the cell safe to re-run.
if 'Message_clean_stemm' not in df_clean.columns:
    df_clean['Message_clean_stemm'] = df_clean['Message_clean'].apply(realizar_stemming)

# Lemmatization

**Lemmatization** is a linguistic process commonly used in *Natural Language Processing* (*NLP*) to reduce words to their base or root form, known as the **lemma**. The lemma represents the canonical, dictionary form of a word. Lemmatization is different from stemming, which involves removing prefixes or suffixes from a word to obtain its root form, even if that root form may not be an actual word.

The main goal of lemmatization is to group different inflected forms of a word so they can be analyzed as a single item. This process is crucial for tasks like text analysis, information retrieval, and language modeling. Lemmatization helps in reducing the dimensionality of the vocabulary and improving the accuracy of text analysis by focusing on the core meaning of words.

Here are some key points about lemmatization:

1. Word Normalization: Lemmatization performs a kind of word normalization by transforming words into their base or root form. For example, the lemma of the word "running" is "run," and the lemma of "better" is "good."

1. Context Preservation: Unlike stemming, lemmatization considers the context and meaning of a word before determining its base form. This helps in producing more accurate and meaningful results.

1. Dictionary-Based Approach: Lemmatization often relies on dictionaries or morphological analysis to map words to their lemmas. These dictionaries include information about the base forms of words and their grammatical properties.

1. Part-of-Speech Consideration: Lemmatization may take into account the part of speech (POS) of a word to determine its correct lemma. For example, the lemma of "better" as an adjective is "good," but as an adverb its lemma is "well."

1. Improved Text Analysis: Lemmatization can improve the accuracy of text analysis tasks such as sentiment analysis, topic modeling, and information retrieval by reducing words to their essential forms.

Here's a simple example in English:

- Word: "Running"
- Lemma: "Run"

In the context of NLP, libraries like NLTK (Natural Language Toolkit) and spaCy provide lemmatization tools and resources for multiple languages. Lemmatization is a valuable preprocessing step in many NLP applications to enhance the understanding and analysis of textual data.


Reference:
- Prompt: What is lemmatization? - [ChatGPT](https://chat.openai.com/)

In [None]:
# NOTE(review): NLTK's WordNetLemmatizer relies on WordNet's English morphology;
# confirm it is the intended lemmatizer for Spanish messages.
lematizador = WordNetLemmatizer()


def lematizar_texto(texto):
    """Return `texto` with every token replaced by its lemma.

    The text is split with NLTK's word tokenizer (Spanish setting), each token
    is passed to the module-level `lematizador` with its default arguments, and
    the results are joined back together with single spaces.
    """
    tokens = nltk.word_tokenize(texto, language='spanish')
    return ' '.join(lematizador.lemmatize(token) for token in tokens)

In [None]:
# Show the lemmatizer's effect on two sample messages (index labels 0 and 23)
# and on one hand-written sentence.
mensajes = df_clean['Message_clean']
separador = '-' * 30

muestra = mensajes[0]
print(f"Original text: {muestra}")
print(f"Lemmatized  text: {lematizar_texto(texto=muestra)}")
print(separador)
muestra = mensajes[23]
print(f"Original text: {muestra}")
print(f"Lemmatized  text: {lematizar_texto(texto=muestra)}")
print(separador)
frase_ejemplo = 'nota como la palabra graves tiene asociado el token grav en la frase menciono los graves conflictos, pero no para menciono los gravess conflictos'
print(f"Original text: {frase_ejemplo}")
print(f"Lemmatized  text: {lematizar_texto(texto=frase_ejemplo)}")


In [None]:
# Compute the lemmatized column only when the loaded frame does not already carry it
# (for example when starting from the plain cleaned text). This replaces manually
# uncommenting the cell on a first run and keeps the cell safe to re-run.
if 'Message_clean_lemm' not in df_clean.columns:
    df_clean['Message_clean_lemm'] = df_clean['Message_clean'].apply(lematizar_texto)

# Stop words

In [None]:
# Load the stop-word list: every non-empty line of the file becomes one entry.
# The encoding is explicit so accented words are decoded the same way on every
# platform, and blank lines are skipped so '' never ends up in the list.
with open('../utils/spanish_stopwords.txt', 'r', encoding='utf-8') as archivo:
    stop_words = [palabra for palabra in (linea.strip() for linea in archivo) if palabra]
len(stop_words)

In [None]:
# Formats Y-axis tick labels in abbreviated form (for example, 120k).
def formato_abreviado(valor, posicion):
    """Return a compact string label for an axis tick value.

    Meant to be wrapped in `matplotlib.ticker.FuncFormatter`, which calls it
    with the tick value and the tick position and expects a string back.

    Args:
        valor: Numeric tick value.
        posicion: Tick position passed by the formatter; not used.

    Returns:
        str: For values of 1000 or more, the value in thousands followed by
        'k', with at most one decimal and no trailing '.0' (120000 -> '120k',
        2500 -> '2.5k'). Smaller values are returned as their integer part
        (500 -> '500').
    """
    if valor >= 1000:
        # Keep one decimal so ticks such as 2500 are not mislabelled as '2k',
        # then drop a trailing '.0' so round thousands still read '120k'.
        miles = f'{valor / 1000:.1f}'.rstrip('0').rstrip('.')
        return f'{miles}k'
    # Always return a string: that is what a tick formatter callback must produce.
    return str(int(valor))

In [None]:
# Persist the frame (including any stemmed/lemmatized columns added above) for later runs.
# NOTE(review): this absolute path is machine-specific and is not the relative
# '../../data/preprocessed/' location the notebook loads from — confirm the target.
#df_clean.to_pickle('/Users/eduardomorenoortiz/Desktop/repos/cdas_itam_nanook/data/preprocessed/stemm_lemm_text_nanook.pkl') # Uncomment if first time running the notebook

In [None]:
# Build the 1-gram corpus from the stemmed messages with the project helper, passing the
# stop-word list and requesting its plot.
# presumably returns an iterable of tokens (it is fed to Counter later) — verify in utils.utils
corpus_stemm = get_corpus_N_gram(list_text=df_clean['Message_clean_stemm'], stop_words=stop_words, ngram=1, show_plot=True)

In [None]:
# Build the 1-gram corpus from the lemmatized messages with the project helper, passing the
# stop-word list and requesting its plot.
# presumably returns an iterable of tokens (it is fed to Counter later) — verify in utils.utils
corpus_lemm = get_corpus_N_gram(list_text=df_clean['Message_clean_lemm'], stop_words=stop_words, ngram=1, show_plot=True)

In [None]:
# Tally how many times each item occurs in each corpus.
count_corpus_stem = Counter(corpus_stemm)
count_corpus_lemm = Counter(corpus_lemm)

In [None]:
# Rank the entries by frequency: from here on each name holds a list of
# (token, count) pairs ordered from most to least frequent.
# Going through Counter(dict(...)) keeps the cell re-runnable: it accepts both the
# Counter built earlier and the ranked list left behind by a previous execution
# (a plain list has no `.most_common()` method).
count_corpus_stem = Counter(dict(count_corpus_stem)).most_common()
count_corpus_lemm = Counter(dict(count_corpus_lemm)).most_common()

In [None]:
# Number of distinct entries under each normalization (stemming first, then lemmatization).
for ranking in (count_corpus_stem, count_corpus_lemm):
    print(len(ranking))

In [None]:
# For every candidate threshold, count how many distinct tokens occur fewer than
# `threshold` times, i.e. how many tokens that cutoff would discard.
#
# dict(...) accepts both a Counter and the ranked list of (token, count) pairs that
# `most_common()` returns; the list form has no `.items()` method, so calling it
# directly raises AttributeError.
frecuencias_stem = list(dict(count_corpus_stem).values())
frecuencias_lemm = list(dict(count_corpus_lemm).values())

# A single pass over the thresholds: repeating the loop would append every
# threshold (and its counts) twice and double the rows of the resulting table.
thresholds = list(range(2, 100))
n_words_stem = [sum(frecuencia < umbral for frecuencia in frecuencias_stem) for umbral in thresholds]
n_words_lem = [sum(frecuencia < umbral for frecuencia in frecuencias_lemm) for umbral in thresholds]

In [None]:
# One row per threshold with the number of tokens lost under each normalization.
df_threshold = pd.DataFrame(
    {
        'threshold': thresholds,
        'stemming': n_words_stem,
        'lemmatization': n_words_lem,
    }
)

In [None]:
# Preview the first rows of the threshold table.
df_threshold.head()

In [None]:
# Plot both "lost tokens" curves on the current axes, addressed explicitly.
ax = plt.gca()
sns.lineplot(data=df_threshold, x='threshold', y='stemming', label='Lost Stemming', ax=ax)
sns.lineplot(data=df_threshold, x='threshold', y='lemmatization', label='Lost Lemmatization', ax=ax)

# Abbreviated tick labels on the Y axis.
ax.yaxis.set_major_formatter(FuncFormatter(formato_abreviado))

# Vertical reference line at threshold = 25.
ax.axvline(x=25, color='red', linestyle='dashed', label='Threshold = 25')

# Axis labels, title and legend.
ax.set_xlabel('Threshold')
ax.set_ylabel('Lost')
ax.set_title('Lost of tokens')
ax.legend()

# Render the figure.
plt.show()

Notice that both lines appear to follow a roughly **logarithmic** pattern!

In [None]:
# Frequency cutoff used by the following cells: tokens seen fewer than `threshold`
# times are counted as lost, tokens seen at least `threshold` times are kept.
threshold = 500

In [None]:
# Tokens whose frequency is below the cutoff.
# dict(...) accepts both a Counter and the ranked list of (token, count) pairs that
# `most_common()` returns; the list form has no `.items()` method.
lost_words_stem = [palabra for palabra, frecuencia in dict(count_corpus_stem).items() if frecuencia < threshold]
lost_words_lemm = [palabra for palabra, frecuencia in dict(count_corpus_lemm).items() if frecuencia < threshold]

In [None]:
#count_corpus_lemm

In [None]:
# Report how many tokens fall below the cutoff under each normalization.
print(
    "Lost tokens",
    f"Stemming: {len(lost_words_stem)}",
    f"Lemmatization: {len(lost_words_lemm)}",
    sep="\n",
)

In [None]:
# Tokens whose frequency reaches the cutoff.
# dict(...) accepts both a Counter and the ranked list of (token, count) pairs that
# `most_common()` returns; the list form has no `.items()` method.
words_stem = [palabra for palabra, frecuencia in dict(count_corpus_stem).items() if frecuencia >= threshold]
words_lemm = [palabra for palabra, frecuencia in dict(count_corpus_lemm).items() if frecuencia >= threshold]

In [None]:
# Peek at the first entry; after the `most_common()` cell this is the most frequent
# (token, count) pair of the stemmed corpus.
count_corpus_stem[0]

In [None]:
# Write the ranked stem counts, one "token: count" line per entry.
# UTF-8 is set explicitly so accented tokens are written identically on every platform.
# NOTE(review): this path is relative to '../data', while the pickles are read from
# '../../data' — confirm the intended output directory.
with open('../data/corpus/corpus_counter_stemming.txt', 'w', encoding='utf-8') as f:
    f.writelines(f"{token}: {conteo}\n" for token, conteo in count_corpus_stem)

In [None]:
# Write the ranked lemma counts, one "token: count" line per entry.
# UTF-8 is set explicitly so accented tokens are written identically on every platform.
# NOTE(review): this path is relative to '../data', while the pickles are read from
# '../../data' — confirm the intended output directory.
with open('../data/corpus/corpus_counter_lemmatization.txt', 'w', encoding='utf-8') as f:
    f.writelines(f"{token}: {conteo}\n" for token, conteo in count_corpus_lemm)

We will select words that appear at least `threshold` times.

In [None]:
# Echo the cutoff applied in the selection that follows.
print(f"threshold: {threshold}")

In [None]:
# Keep the tokens that appear at least `threshold` times.
# dict(...) accepts both a Counter and the ranked list of (token, count) pairs that
# `most_common()` returns; the list form has no `.items()` method.
words_stem = [palabra for palabra, frecuencia in dict(count_corpus_stem).items() if frecuencia >= threshold]
words_lemm = [palabra for palabra, frecuencia in dict(count_corpus_lemm).items() if frecuencia >= threshold]

In [None]:
# Report how many tokens survive the cutoff under each normalization.
print(
    f"Number of words by Stemming: {len(words_stem)}",
    f"Number of words by Lemmatization: {len(words_lemm)}",
    sep="\n",
)