In [None]:
%pip install spacy

%pip install nltk
%pip install bertopic
%pip install seaborn

In [None]:
import pandas as pd

# Load the contracts table into `df_contratos`.
# NOTE(review): the path below reads like a drafting placeholder rather than a
# real file name — presumably it should point at the CSV exported by the main
# notebook / API request. TODO confirm and replace before running.
df_contratos = pd.read_csv("read csv exported from MAIN.IPYNB/ API REQUEST")
# Print the first rows as a quick sanity check of what was loaded.
print(df_contratos.head())

In [None]:
def clean_lemma_batch(texts, stop_words, nlp):
    """
    Lemmatize a batch of texts with spaCy and strip the noise tokens.

    Each text is run through ``nlp.pipe`` (batches of 1000, with the parser and
    NER components disabled for the call). A token's lemma is kept unless the
    lemma is in ``stop_words``, or the token is punctuation or whitespace.
    The stopword check compares the lemma exactly as spaCy returns it; no case
    folding or accent normalisation is applied here.

    Parameters:
    - texts: list of str
        Raw texts to process, e.g.
        df_contratos['objeto_del_contrato'].astype(str).tolist()
    - stop_words: set
        Lemmas to discard. Any container supporting ``in`` works; a set keeps
        the lookup cheap.
    - nlp: spaCy language object
        A loaded pipeline, e.g.
        spacy.load("es_core_news_sm", disable=["ner", "parser"])

    Returns:
    - list of str
        One string per input text, in input order, made of the surviving
        lemmas joined by single spaces (an empty string when nothing survives).

    Example:
        stop_words = set(stopwords.words('spanish'))
        nlp = spacy.load("es_core_news_sm", disable=["ner", "parser"])
        texts = df_contratos['objeto_del_contrato'].astype(str).tolist()
        cleaned = clean_lemma_batch(texts, stop_words, nlp)
    """
    def _is_noise(tok):
        # Same checks, same order as a chained "not ... and not ..." filter.
        return tok.lemma_ in stop_words or tok.is_punct or tok.is_space

    parsed_docs = nlp.pipe(texts, batch_size=1000, disable=["parser", "ner"])
    return [
        ' '.join(tok.lemma_ for tok in parsed if not _is_noise(tok))
        for parsed in parsed_docs
    ]

In [None]:
import nltk
from nltk.corpus import stopwords
import spacy

# Define the function to clean and lemmatize a batch of texts
def clean_lemma_batch(texts, stop_words, nlp):
    """
    Return one cleaned, lemmatized string per input text.

    The texts are streamed through ``nlp.pipe`` in batches of 1000 with the
    parser and NER components disabled. For every token the lemma is kept
    unless it appears in ``stop_words`` or the token is punctuation or
    whitespace; the kept lemmas are joined with single spaces.

    Parameters:
    - texts: list of str -- raw texts to process.
    - stop_words: set -- lemmas to drop (compared exactly as spaCy emits them).
    - nlp: spaCy language object -- the loaded pipeline.

    Returns:
    - list of str, aligned with ``texts``.
    """
    results = []
    for parsed in nlp.pipe(texts, batch_size=1000, disable=["parser", "ner"]):
        kept = []
        for tok in parsed:
            lemma = tok.lemma_
            # Skip stopwords first, then punctuation, then whitespace tokens.
            if lemma in stop_words or tok.is_punct or tok.is_space:
                continue
            kept.append(lemma)
        results.append(' '.join(kept))
    return results

# Contract-specific stopwords, merged later with the generic Spanish list.
# Kept in alphabetical groups; some terms appear both with and without their
# accent (e.g. 'area' / 'área') and are matched exactly as written.
contract_stopwords = {
    # a - c
    'acuerdo', 'actividad', 'adelantar', 'apoyo', 'area', 'autonomía',
    'bajo',
    'centro', 'comprometer', 'condicion', 'conformidad', 'contratar',
    'contratista', 'contrato', 'cuenta',
    # d - e
    'departamental', 'dentro', 'diferente', 'disposición',
    'efectivo', 'eficaz', 'eficiente', 'ejecución', 'ejecutar', 'empresa',
    'entidad', 'especializado',
    # f - o
    'forma',
    'garantizar',
    'institucional',
    'laboral', 'local',
    'municipio',
    'nivel', 'nuevo',
    'objeto', 'oportuno',
    # p - t
    'poner', 'prestacion', 'prestación', 'prestar', 'primero', 'proceso',
    'profesional', 'propuesta',
    'realizar', 'requerido', 'requerir', 'responsabilidad',
    'servicio', 'subproceso',
    'tiempo',
    # accented initial
    'área',
}

# Example usage (to be run outside this cell, not inside the function):
# nltk.download('stopwords')
# stop_words = set(stopwords.words('spanish'))
# stop_words.update(contract_stopwords)
# nlp = spacy.load("es_core_news_sm", disable=["ner", "parser"])
# texts = df_contratos['objeto_del_contrato'].astype(str).tolist()
# cleaned = clean_lemma_batch(texts, stop_words, nlp)

In [None]:
# Fetch the NLTK stopword corpus needed by `stopwords.words`.
nltk.download('stopwords')

# Generic Spanish stopwords plus the contract-specific terms, as one set.
stop_words = set(stopwords.words('spanish')).union(contract_stopwords)

# Spanish spaCy pipeline with NER and the parser disabled.
# NOTE(review): no visible cell installs the "es_core_news_sm" model package;
# presumably it must be downloaded beforehand
# (`python -m spacy download es_core_news_sm`) — TODO confirm.
nlp = spacy.load("es_core_news_sm", disable=["ner", "parser"])


In [None]:
def process_dataframe_in_chunks(df_contratos, chunk_size=10000):
    """
    Unfinished draft: walk ``df_contratos`` in row chunks and report progress.

    For each chunk this prints a progress line and extracts the
    ``objeto_del_contrato`` column as a list of strings. The extracted texts
    are not processed further and nothing is returned. A later cell redefines
    this name with a four-argument version that lemmatizes and returns a frame.

    Parameters:
    - df_contratos: pandas.DataFrame
        Must contain an ``objeto_del_contrato`` column.
    - chunk_size: int
        Maximum number of rows per chunk; must be >= 1.

    Returns:
    - None

    Raises:
    - ValueError: if ``chunk_size`` is smaller than 1.
    - KeyError: if the ``objeto_del_contrato`` column is missing.
    """
    if chunk_size < 1:
        raise ValueError("chunk_size must be a positive integer")

    n_rows = len(df_contratos)
    # Ceiling division: `n_rows // chunk_size + 1` produced a phantom empty
    # chunk whenever n_rows was an exact multiple of chunk_size. An empty
    # frame still yields a single (empty) chunk, as before.
    total_chunks = max(1, (n_rows + chunk_size - 1) // chunk_size)

    for i in range(total_chunks):
        start = i * chunk_size
        end = min(start + chunk_size, n_rows)
        chunk = df_contratos.iloc[start:end]
        print(f"🔹 Processing chunk {i+1}/{total_chunks} ({end-start} rows)")

        # Extracted but not consumed yet — this is where the draft stops.
        texts = chunk['objeto_del_contrato'].astype(str).tolist()




In [None]:
def process_dataframe_in_chunks(df, stop_words, nlp, chunk_size=10000):
    """
    Lemmatize the ``objeto_del_contrato`` column chunk by chunk.

    The frame is sliced into consecutive row chunks of at most ``chunk_size``
    rows. Each chunk is copied, its ``objeto_del_contrato`` values are cast to
    ``str`` and passed to ``clean_lemma_batch``, and the result is stored in a
    new ``objeto_clean_lemma`` column. The input frame is not modified.

    Parameters:
    - df: pandas.DataFrame
        Must contain an ``objeto_del_contrato`` column.
    - stop_words: set
        Forwarded unchanged to ``clean_lemma_batch``.
    - nlp: spaCy language object
        Forwarded unchanged to ``clean_lemma_batch``.
    - chunk_size: int
        Maximum number of rows per chunk; must be >= 1.

    Returns:
    - pandas.DataFrame
        All processed chunks concatenated in order, with the added
        ``objeto_clean_lemma`` column and a fresh 0..n-1 index (the original
        index is discarded by ``ignore_index=True``).

    Raises:
    - ValueError: if ``chunk_size`` is smaller than 1.
    - KeyError: if the ``objeto_del_contrato`` column is missing.
    """
    if chunk_size < 1:
        # A negative size would otherwise slice a silent partial frame.
        raise ValueError("chunk_size must be a positive integer")

    n_rows = len(df)
    # Ceiling division: `n_rows // chunk_size + 1` scheduled a phantom empty
    # chunk (and a wrong progress total) whenever n_rows was an exact multiple
    # of chunk_size. max(1, ...) keeps one pass for an empty frame so the
    # output still gains the new column and pd.concat has something to join.
    total_chunks = max(1, (n_rows + chunk_size - 1) // chunk_size)
    cleaned_results = []

    for i in range(total_chunks):
        start = i * chunk_size
        end = min(start + chunk_size, n_rows)
        # Copy so that adding the column never touches the caller's frame.
        chunk = df.iloc[start:end].copy()
        print(f"🔹 Processing chunk {i+1}/{total_chunks} ({end-start} rows)")

        texts = chunk['objeto_del_contrato'].astype(str).tolist()
        chunk['objeto_clean_lemma'] = clean_lemma_batch(texts, stop_words, nlp)

        cleaned_results.append(chunk)

    return pd.concat(cleaned_results, ignore_index=True)


In [None]:
# Lemmatize `objeto_del_contrato` for every contract, chunk by chunk. The
# result is a new frame that carries an added `objeto_clean_lemma` column.
df_cleaned = process_dataframe_in_chunks(df_contratos, stop_words, nlp)


In [None]:
# Load the previously saved BERTopic model from disk. `embedding_model` names
# the sentence-embedding model to use for new documents.
# NOTE(review): presumably this must match the embedder the model was fitted
# with — TODO confirm.
from bertopic import BERTopic
topic_model = BERTopic.load("my_bertopic_modelv07",embedding_model="all-MiniLM-L6-v2")


# Assign a topic to every cleaned document. All documents go through a single
# `transform` call; no batching is done here.
docs = df_cleaned['objeto_clean_lemma'].tolist()
topics, probs = topic_model.transform(docs)

# Store the predictions next to the source rows (adds columns to `df_cleaned`
# in place).
df_cleaned['predicted_topic'] = topics
# NOTE(review): assumes `probs` holds one value per document. If the model
# returns a documents x topics matrix instead, it may not fit a single
# column — TODO confirm against the saved model's settings.
df_cleaned['topic_probability'] = probs


In [None]:
# Write the labeled contracts to CSV without the row index. The 'utf-8-sig'
# encoding prepends a byte-order mark, commonly used so spreadsheet tools
# detect the file as UTF-8.
df_cleaned.to_csv("contratos_2024_labeled.csv", index=False, encoding='utf-8-sig')


In [None]:
# Write the labeled contracts to a pickle file as well.
# Only unpickle files that come from a trusted source.
df_cleaned.to_pickle("contratos_2024_labeled.pkl")