# Word cloud
---

Experiment with generating a word cloud from text. The pipeline should first clean up the text: remove stop words and punctuation, and convert everything to lowercase.

## Setup

### Import libraries

In [None]:
import os
from pathlib import Path
from IPython.display import Markdown, display
from tqdm.auto import tqdm
from lingua import LanguageDetectorBuilder
import string
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import matplotlib.pyplot as plt
from wordcloud import WordCloud
from pydantic import BaseModel, Field
from openai import OpenAI

In [None]:
from polids.config import settings

### Set parameters

In [None]:
# List the current working directory to check where the notebook is running from
os.listdir()

In [None]:
# Move up one directory; presumably so the relative "data/..." path used below resolves
# NOTE(review): re-running this cell moves up one more level each time
os.chdir("..")

In [None]:
# List the new working directory to confirm the move
os.listdir()

In [None]:
# Directory that holds the Portugal 2022 program documents
human_annotated_data_path = Path("data/portugal_2022/programs/")
# This document had stopword issues before (e.g. a lot of "se"),
# which makes it a useful test case for the cleanup steps
human_annotated_md = human_annotated_data_path.joinpath("be.md")

## Load data

In [None]:
# Read the document explicitly as UTF-8: without it, `read_text` falls back to the
# platform's default encoding, which can garble accented characters
markdown_content = human_annotated_md.read_text(encoding="utf-8")
# Render the loaded markdown in the notebook for a visual check
display(Markdown(markdown_content))

## Define lemmatization step

Lemmatization is the process of reducing a word to its base or root form. For example, "running" becomes "run", and "better" becomes "good". This is useful for word cloud generation as it can group together similar words and reduce the overall number of unique words in the text.

In [None]:
# Client used for the LLM-based lemmatization; the API key comes from the project settings
openai_client = OpenAI(api_key=settings.openai_api_key)

In [None]:
# Schema passed as `response_format` to the structured-output call in `lemmatize_word`.
# NOTE(review): the class docstring and the `Field` descriptions are presumably exported
# as part of the schema the model sees, so treat edits to them as prompt changes.
class LemmatizedWord(BaseModel):
    """
    Represents the lemmatized form of a word, derived from its context and language.
    Designed for structured output from an LLM.
    """

    # Base/dictionary form of the word; the original word when no lemma applies
    lemma: str = Field(
        description="The lemmatized (base/dictionary) form of the input word, determined by its context and language rules. If the word is unknown, a proper noun without a standard lemma, or already in its base form, return the original word."
    )
    # POS tag that justifies the lemma; "UNKNOWN" is also used by the fallback
    # result that `lemmatize_word` builds when the LLM call fails
    part_of_speech: str = Field(
        description="The inferred part of speech (e.g., NOUN, VERB, ADJ, PROPN, UNKNOWN) based on the context that justifies the lemma. Use UNKNOWN if the POS cannot be determined reliably.",
    )


def lemmatize_word(
    word_to_lemmatize: str,
    text_chunk: str,
    language: str,
    client: OpenAI,
    llm_name: str = "gpt-4.1-nano-2025-04-14",
    temperature: float = 0.0,
    seed: int = 42,
) -> LemmatizedWord:
    """
    Finds the lemma of a word given its context and language.

    This function is best-effort: any failure while calling the LLM or while
    validating its output is printed and a fallback result is returned instead
    of raising, so a single bad call does not abort a long processing loop.

    Args:
        word_to_lemmatize (str): The specific word to lemmatize.
        text_chunk (str): The surrounding text context for the word.
        language (str): The language of the word and text chunk (e.g., 'English', 'Spanish').
        client (OpenAI): An initialized OpenAI client instance.
        llm_name (str, optional): The OpenAI GPT-4.1 model to use. Defaults to "gpt-4.1-nano-2025-04-14".
        temperature (float, optional): Sampling temperature for the model. Defaults to 0.0.
        seed (int, optional): Seed for reproducibility. Defaults to 42.

    Returns:
        LemmatizedWord: The lemma and part of speech returned by the model. If the
            request fails or the parsed output is not a LemmatizedWord, the original
            word is returned as the lemma with part of speech "UNKNOWN".
    """

    # System prompt structured according to GPT-4.1 guide recommendations.
    # NOTE: this is a plain (non f-) string, so JSON braces are written as single
    # braces, and the described fields match the `LemmatizedWord` schema exactly.
    system_prompt = """
# Role and Objective
You are an expert multilingual computational linguist. Your objective is to accurately lemmatize a given <word> based on its <context> and <language>, strictly adhering to the provided instructions and output format.

# Instructions
- Analyze the provided <word> within its surrounding <context>.
- Determine the word's intended meaning and part of speech (POS) based *only* on the provided <context> and the rules of the specified <language>.
- Output the correct lemma (dictionary/base form) for the <word>.
- If the <word> is already in its lemma form (e.g., "meeting" used as a noun), return the word itself as the lemma.
- If the <word> is a proper noun (e.g., "London"), return the word itself as the lemma and identify the POS as 'PROPN'.
- If the <word> appears to be an unknown word, typo, or cannot be reliably lemmatized using the context, return the original <word> as the lemma and set POS to 'UNKNOWN'.
- The lemmatization must be specific to the given <language>. For example, "better" (English) -> "good"; "hablaba" (Spanish) -> "hablar".
- Follow the reasoning steps outlined below.
- Generate output *only* in the specified structured format. Do not add any extra explanations or conversational text.

# Reasoning Steps
1.  **Identify Inputs:** Note the specific <word>, <context>, and <language>.
2.  **Contextual Analysis:** Read the <context> carefully to understand how the <word> is used.
3.  **POS Tagging:** Determine the most likely part of speech (POS) of the <word> in this specific context (e.g., VERB, NOUN, ADJ, ADV, PROPN).
4.  **Lemmatization Rule Application:** Apply the lemmatization rules for the identified POS in the specified <language> to find the base/dictionary form (lemma).
5.  **Handle Edge Cases:** Check if the word is a proper noun, already a lemma, or unknown/unclear. Adjust lemma and POS accordingly (using 'PROPN' or 'UNKNOWN' for POS if applicable, and returning the original word as lemma in these cases).
6.  **Format Output:** Construct the final output strictly according to the `LemmatizedWord` schema.

# Output Format
Provide the result as a JSON object conforming to the `LemmatizedWord` schema. The required fields are:
- `lemma`: string (The determined lemma)
- `part_of_speech`: string (e.g., "NOUN", "VERB", "ADJ", "PROPN", "UNKNOWN")

# Examples
<example>
<input>
  <word>running</word>
  <context>He is running quickly.</context>
  <language>English</language>
</input>
<output>
  {
    "lemma": "run",
    "part_of_speech": "VERB"
  }
</output>
</example>

<example>
<input>
  <word>meetings</word>
  <context>We hold weekly meetings.</context>
  <language>English</language>
</input>
<output>
  {
    "lemma": "meeting",
    "part_of_speech": "NOUN"
  }
</output>
</example>

<example>
<input>
  <word>Paris</word>
  <context>She traveled to Paris.</context>
  <language>English</language>
</input>
<output>
  {
    "lemma": "Paris",
    "part_of_speech": "PROPN"
  }
</output>
</example>

<example>
<input>
  <word>corpora</word>
  <context>Linguistic analysis often involves large text corpora.</context>
  <language>English</language>
</input>
<output>
  {
    "lemma": "corpus",
    "part_of_speech": "NOUN"
  }
</output>
</example>

<example>
<input>
  <word>chevaux</word>
  <context>Les chevaux sauvages galopaient.</context>
  <language>French</language>
</input>
<output>
  {
    "lemma": "cheval",
    "part_of_speech": "NOUN"
  }
</output>
</example>

Now, apply these instructions and reasoning steps meticulously to the user's input.
"""

    # User prompt using clear delimiters
    user_prompt = f"""Please perform lemmatization according to the system instructions based on the following details:

<input>
  <language>{language}</language>
  <word>{word_to_lemmatize}</word>
  <context>{text_chunk}</context>
</input>

Provide the output in the specified structured JSON format.
"""

    try:
        completion = client.beta.chat.completions.parse(
            model=llm_name,
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": user_prompt},
            ],
            response_format=LemmatizedWord,  # Enforce the schema
            temperature=temperature,
            seed=seed,
        )

        lemmatized_result = completion.choices[0].message.parsed

        # Validate the output type with an explicit check rather than `assert`,
        # which is stripped when Python runs with -O
        if not isinstance(lemmatized_result, LemmatizedWord):
            raise TypeError(
                f"Output failed Pydantic validation. Expected LemmatizedWord, "
                f"got {type(lemmatized_result)}"
            )

        return lemmatized_result

    except Exception as e:
        # Deliberate best-effort boundary: report the problem and keep going
        print(f"An error occurred during LLM processing: {e}")
        # Fallback strategy: Return the original word as lemma with UNKNOWN POS
        return LemmatizedWord(
            lemma=word_to_lemmatize,
            part_of_speech="UNKNOWN",  # Indicate failure/uncertainty via POS
        )

## Clean up the text

### Detect the language

In [None]:
# Include only languages that are not yet extinct (= currently excludes Latin)
language_detector = LanguageDetectorBuilder.from_all_spoken_languages().build()
language_detection_result = language_detector.detect_language_of(markdown_content)
# The detector returns None when it cannot reliably identify a language;
# fail with a clear message instead of an AttributeError on `.name`
if language_detection_result is None:
    raise ValueError("Could not detect the language of the document.")
# Lowercase the language name so it can be used to look up the NLTK stop words
detected_language = language_detection_result.name.lower()
print(f"Detected language: {detected_language}")

### Remove stop words and lemmatize

In [None]:
# Download the stop word lists used by `stopwords.words` below
nltk.download("stopwords")
# Download the tokenizer data; presumably what `word_tokenize` needs at runtime
nltk.download("punkt_tab")

In [None]:
# Keep the stop words in a set for fast membership tests during filtering
stop_words = set(stopwords.words(detected_language))
# Sort before printing: set iteration order varies between runs, which would
# otherwise make the displayed output change on every execution
print(f"Stop words in {detected_language}:\n{', '.join(sorted(stop_words))}")

In [None]:
word_tokens = word_tokenize(markdown_content)
# Lowercase and trim words
word_tokens_clean = [w.lower().strip() for w in word_tokens]
# Number of neighboring tokens given to the LLM as context,
# split evenly before and after the word being lemmatized
context_size_in_words = 10
half_context = context_size_in_words // 2
filtered_words = []
for idx, word in tqdm(
    enumerate(word_tokens_clean), total=len(word_tokens_clean), desc="Processing words"
):
    # Remove markdown symbols
    if word.startswith("#"):
        continue
    # Remove empty strings and tokens made only of punctuation/symbols.
    # Checking every character also catches multi-character tokens ("...", "--")
    # and non-ASCII punctuation, which a substring test against
    # `string.punctuation` lets through
    if not any(char.isalnum() for char in word):
        continue
    # Remove stopwords
    if word in stop_words:
        continue
    # Remove words with numbers
    if any(char.isdigit() for char in word):
        continue
    # Get the neighboring words: up to `half_context` on each side of the word
    # (the slice end is exclusive, hence the +1 to keep the window symmetric)
    start_idx = max(0, idx - half_context)
    end_idx = min(len(word_tokens_clean), idx + half_context + 1)
    context = " ".join(word_tokens_clean[start_idx:end_idx])
    # Lemmatize the word
    lemmatized_word = lemmatize_word(
        word_to_lemmatize=word,
        text_chunk=context,
        language=detected_language,
        client=openai_client,
    )
    # Add the lemmatized word to the list
    filtered_words.append(lemmatized_word.lemma)
print(f"Filtered words:\n{', '.join(filtered_words)}")

## Generate the word cloud

In [None]:
def get_word_cloud(
    words: list[str],
    max_words: int = 500,
    image_path: str | None = None,
    image_name: str | None = None,
):
    """
    Create a word cloud based on a set of words.

    Args:
        words (list[str]):
            List of words to be included in the word cloud.
        max_words (int):
            Maximum number of words to be included in the word cloud.
        image_path (str):
            Path to the image file where to save the word cloud.
        image_name (str):
            Name of the image where to save the word cloud.
    """

    # change the value to black
    def black_color_func(
        word, font_size, position, orientation, random_state=None, **kwargs
    ):
        return "hsl(0,100%, 1%)"

    # set the wordcloud background color to white
    # set width and height to higher quality, 3000 x 2000
    wordcloud = WordCloud(
        background_color="white",
        width=3000,
        height=2000,
        max_words=max_words,
    ).generate(" ".join(words))
    # set the word color to black
    wordcloud.recolor(color_func=black_color_func)
    # set the figsize
    plt.figure(figsize=[15, 10])
    # plot the wordcloud
    plt.imshow(wordcloud, interpolation="bilinear")
    # remove plot axes
    plt.axis("off")
    if image_path is not None and image_name is not None:
        # save the image
        plt.savefig(os.path.join(image_path, image_name), bbox_inches="tight")

In [None]:
# Plot the word cloud for the filtered, lemmatized words (no image file is saved)
get_word_cloud(filtered_words)