In [1]:
import pandas as pd
from datetime import datetime
import plotly.express as px
import matplotlib.pyplot as plt
import numpy as np
from wordcloud import WordCloud
import plotly.graph_objects as go

# NLTK setup: make sure the WordNet and stopword corpora are available and
# build the English stopword set that clean_text filters against.
import nltk
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
nltk.download('stopwords')
# NOTE(review): this repeats the `stopwords` import from two lines above.
from nltk.corpus import stopwords
# Stored as a set so per-token membership checks are cheap.
stop_words = set(stopwords.words("english"))
from nltk import FreqDist
from nltk import ngrams

from typing import Any, Optional

from bertopic import BERTopic
#from plotly.offline import init_notebook_mode
#
#init_notebook_mode(connected=True)

import warnings
# NOTE(review): this silences every warning category for the rest of the
# session, so deprecation and data-handling warnings from the cells below
# will not be shown.
warnings.filterwarnings("ignore")

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Edward\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Edward\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def clean_text(columna_datos):
    """Normalize a collection of raw texts into cleaned, space-joined strings.

    Each text is tokenized with a regular expression, every token is
    lemmatized with WordNet, and tokens are dropped when they are English
    stopwords, contain non-alphanumeric characters, or are shorter than
    three characters. Surviving tokens are lowercased and joined with a
    single space.

    Parameters
    ----------
    columna_datos : pandas.Series or iterable of str
        The texts to clean. Missing entries (None/NaN) are treated as empty
        text; other non-string values are converted with ``str``.

    Returns
    -------
    pandas.DataFrame
        A frame with one column, ``"Tokens"``, holding the cleaned string for
        each input text. When the input is a Series the result carries the
        same index, so assigning it back to the source frame stays
        row-aligned even after the frame has been sorted or filtered. For
        any other iterable a default 0..n-1 index is used.
    """
    # The hyphen sits at the end of the punctuation class so it is a literal;
    # written as ``:-_`` it formed an unintended character range.
    pattern = r'''(?x)                 # set flag to allow verbose regexps
              (?:[A-Z]\.)+         # abbreviations, e.g. U.S.A.
              | \w+(?:-\w+)*       # words with optional internal hyphens
              | \$?\d+(?:\.\d+)?%? # currency and percentages, e.g. $12.40, 82%
              | \.\.\.             # ellipsis
              | [][.,;"'?():_`-]   # these are separate tokens; includes ], [
    '''
    lemmatizer = WordNetLemmatizer()

    def _limpiar(texto):
        """Clean one text and return it as a single lowercase string."""
        if not isinstance(texto, str):
            # Missing titles would otherwise crash the tokenizer.
            texto = "" if pd.isna(texto) else str(texto)
        # NOTE(review): lemmatization runs before lowercasing, as in the
        # original pipeline; capitalized tokens may therefore not be reduced
        # to their lemma. Kept unchanged so the output stays comparable with
        # text cleaned earlier by this function — confirm before reordering.
        lematizados = (
            lemmatizer.lemmatize(token)
            for token in nltk.regexp_tokenize(texto, pattern)
        )
        return ' '.join(
            token.lower()
            for token in lematizados
            if token.lower() not in stop_words  # drop stopwords
            and token.isalnum()                 # drop punctuation / hyphenated tokens
            and len(token) >= 3                 # drop very short tokens
        )

    textos_limpios = [_limpiar(texto) for texto in columna_datos]

    # Preserve the caller's row labels so the result aligns on assignment.
    indice = columna_datos.index if isinstance(columna_datos, pd.Series) else None
    return pd.DataFrame({"Tokens": textos_limpios}, index=indice)

In [4]:
# Load the weekly Ecopetrol dataset and order it chronologically.
data = pd.read_csv("./Datos ecopetrol/Datos_ecopetrol_semanales_divididos.csv")
data["Date"] = pd.to_datetime(data["Date"])
data = data.sort_values(by="Date")

# clean_text processes the titles in row order. Assign its "Tokens" column by
# position rather than by index label: after sort_values the index of `data`
# is permuted, and a label-aligned assignment could pair a cleaned text with
# the wrong title.
data["text"] = clean_text(data["Title"])["Tokens"].to_numpy()

# Independent copies per split, so adding columns to one of them later does
# not operate on a slice of `data`.
train_weekly = data.loc[data["Split"] == "Train"].copy()
val_weekly = data.loc[data["Split"] == "Validation"].copy()
test_weekly = data.loc[data["Split"] == "Test"].copy()

In [5]:
# Load the previously saved BERTopic model from disk.
# NOTE(review): depending on the format it was saved in, loading may
# unpickle Python objects — only load model files from a trusted source.
topic_model = BERTopic.load("./Modelos/bertopic_model")

In [6]:
%%capture
assigned_topics = []
for i in val_weekly["text"]:
    topics_pred, probs_pred = topic_model.transform(i)
    assigned_topics.extend(topics_pred)
val_weekly["Topic"] = assigned_topics
val_weekly.to_csv("./Datos ecopetrol/Datos_ecopetrol_topicos_validation.csv", index=False)