# Word cloud
---

Experiment with generating a word cloud from text. The text should be cleaned up first: everything is converted to lowercase, and stop words and punctuation are excluded.

## Setup

### Import libraries

In [None]:
import os
from pathlib import Path
from IPython.display import Markdown, display
from lingua import LanguageDetectorBuilder
import pycountry
import string
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import SnowballStemmer
import simplemma
import matplotlib.pyplot as plt
from wordcloud import WordCloud

In [None]:
from polids.word_cloud.wordcloud import WordCloudGenerator

### Set parameters

In [None]:
# Show the contents of the current working directory
os.listdir()

In [None]:
# Move the working directory one level up, presumably so that the relative
# `data/...` path set below resolves (verify against the repository layout).
# NOTE: the move is relative, so re-running this cell goes up one more level.
os.chdir("..")

In [None]:
# Show the contents of the working directory again, after the directory change
os.listdir()

In [None]:
# Folder holding the program files used in this experiment
human_annotated_data_path = Path("data", "portugal_2022", "programs")
# Pick the document that previously had stop word issues (e.g. many "se")
human_annotated_md = human_annotated_data_path.joinpath("be.md")

## Load data

In [None]:
# Read the file explicitly as UTF-8: without an encoding argument the
# locale-dependent default is used, which can garble accented characters
markdown_content = human_annotated_md.read_text(encoding="utf-8")
# Render the loaded markdown in the notebook output
display(Markdown(markdown_content))

## Clean up the text

### Detect the language

In [None]:
# Include only languages that are not yet extinct (= currently excludes Latin)
language_detector = LanguageDetectorBuilder.from_all_spoken_languages().build()
language_detection_result = language_detector.detect_language_of(markdown_content)
# The detector returns None when no language can be reliably determined;
# fail with an explicit message instead of an AttributeError on `.name`
if language_detection_result is None:
    raise ValueError("Could not detect the language of the text")
# Lowercase language name, reused below for the stop word and stemmer lookups
detected_language = language_detection_result.name.lower()
print(f"Detected language: {detected_language}")

In [None]:
# Look up the language record by its capitalized name (e.g. "Portuguese")
language_record = pycountry.languages.get(name=detected_language.capitalize())
# The lookup can come back empty for names pycountry does not know, and not
# every language record has a two-letter code; handle both cases explicitly
detected_language_code = getattr(language_record, "alpha_2", None)
if detected_language_code is None:
    raise ValueError(
        f"Could not find a two-letter language code for: {detected_language}"
    )
print(f"Detected language code: {detected_language_code}")

### Remove stop words and lemmatize

In [None]:
# Fetch the NLTK resources used below: the stop word lists and the
# tokenizer data
for nltk_resource in ("stopwords", "punkt_tab"):
    nltk.download(nltk_resource)

In [None]:
# Collect the NLTK stop words of the detected language into a set
stop_words = {*stopwords.words(detected_language)}
# Show them as a single comma-separated listing
stop_words_listing = ", ".join(stop_words)
print(f"Stop words in {detected_language}:\n{stop_words_listing}")

In [None]:
# Tokenize with the tokenizer model of the detected language rather than the
# default one; fall back to the default model when NLTK has no model for
# that language
try:
    word_tokens = word_tokenize(markdown_content, language=detected_language)
except LookupError:
    word_tokens = word_tokenize(markdown_content)
# Lowercase and trim words
word_tokens_clean = [w.lower().strip() for w in word_tokens]
# Filter and clean words
filtered_words = [
    # Lemmatize the words (they are already lowercased at this point)
    simplemma.lemmatize(w, lang=detected_language_code)
    for w in word_tokens_clean
    # Remove stopwords
    if w not in stop_words
    # Remove empty tokens and tokens made up only of punctuation, whatever
    # their length (a plain `w in string.punctuation` is a substring test,
    # which lets tokens such as "..." through)
    and not all(char in string.punctuation for char in w)
    # Remove markdown heading symbols
    and not w.startswith("#")
    # Remove words with numbers
    and not any(char.isdigit() for char in w)
]
# Stem words if they have a special character
stemmer = SnowballStemmer(detected_language)
filtered_words = [
    stemmer.stem(w) if any(char in string.punctuation for char in w) else w
    for w in filtered_words
]
# Remove words with less than 3 characters
filtered_words = [w for w in filtered_words if len(w) > 2]
print(f"Filtered words:\n{', '.join(filtered_words)}")

## Generate the word cloud

In [None]:
def get_word_cloud(
    words: list[str],
    max_words: int = 500,
    image_path: str | None = None,
    image_name: str | None = None,
) -> None:
    """
    Plot a black-on-white word cloud from a list of words.

    The words are joined with spaces and handed to `WordCloud`, the result is
    drawn on a new matplotlib figure, and the figure is optionally saved.

    Args:
        words (list[str]):
            List of words to be included in the word cloud. They are expected
            to be cleaned already (e.g. stop words removed).
        max_words (int):
            Maximum number of words to be included in the word cloud.
        image_path (str | None):
            Directory where to save the word cloud image.
        image_name (str | None):
            File name of the word cloud image. The image is only saved when
            both `image_path` and `image_name` are given.
    """

    # Color every word with the same near-black value
    def black_color_func(
        word, font_size, position, orientation, random_state=None, **kwargs
    ):
        return "hsl(0,100%, 1%)"

    # Set the wordcloud background color to white
    # Set width and height to higher quality, 3000 x 2000
    wordcloud = WordCloud(
        background_color="white",
        width=3000,
        height=2000,
        max_words=max_words,
        # The words are already filtered, so pass an empty set: `None` would
        # make WordCloud fall back to its built-in English stop word list
        stopwords=set(),
        # `None` selects the library's default word pattern (this is not a
        # plain whitespace split)
        regexp=None,
        min_word_length=3,  # Drop words with less than 3 characters
    ).generate(" ".join(words))
    # Set the word color to black
    wordcloud.recolor(color_func=black_color_func)
    # Set the figsize
    plt.figure(figsize=[15, 10])
    # Plot the wordcloud
    plt.imshow(wordcloud, interpolation="bilinear")
    # Remove plot axes
    plt.axis("off")
    if image_path is not None and image_name is not None:
        # Save the image
        plt.savefig(os.path.join(image_path, image_name), bbox_inches="tight")

In [None]:
# Plot the word cloud of the cleaned-up words (nothing is saved to disk)
get_word_cloud(words=filtered_words)

## Implemented solution

In [None]:
# Build the packaged generator from the same raw markdown text
# NOTE(review): presumably it applies cleanup steps like the ones above
# internally; confirm against the WordCloudGenerator implementation
word_cloud_generator = WordCloudGenerator(text=markdown_content)
# No image path or name is passed; presumably nothing is saved to disk (confirm)
word_cloud_generator.generate_word_cloud(image_path=None, image_name=None)