In [1]:
# Data handling, plotting and text-visualisation libraries.
import pandas as pd
from datetime import datetime
import plotly.express as px
import matplotlib.pyplot as plt
import numpy as np
from wordcloud import WordCloud
import plotly.graph_objects as go

# NLTK: download the WordNet and stopword corpora (no-op when already present)
# and import the lemmatizer / stopword helpers used by clean_text.
import nltk
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
nltk.download('stopwords')
# NOTE(review): duplicate of the import two lines above; harmless but redundant.
from nltk.corpus import stopwords
# English stopwords kept as a set for fast membership tests in clean_text.
stop_words = set(stopwords.words("english"))
from nltk import FreqDist
from nltk import ngrams

from typing import Any, Optional

from bertopic import BERTopic
#from plotly.offline import init_notebook_mode
#
#init_notebook_mode(connected=True)

# NOTE(review): this silences every Python warning for the rest of the session,
# including pandas' SettingWithCopyWarning and library deprecation notices.
import warnings
warnings.filterwarnings("ignore")

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Edward\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Edward\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def clean_text(columna_datos):
    """Turn raw texts into lemmatized, stopword-free, lower-cased strings.

    Processing steps, applied to every text in order:

    1. Tokenize with a regular expression (abbreviations, words, numbers,
       ellipsis and single punctuation marks become tokens).
    2. Lemmatize each token with WordNet.
    3. Drop English stopwords (case-insensitive comparison).
    4. Drop tokens that are not purely alphanumeric or have fewer than
       three characters.
    5. Lower-case the survivors and join them with single spaces.

    Parameters
    ----------
    columna_datos : pandas.Series or iterable of str
        Texts to clean. Missing values (None / NaN / pd.NA) are treated as
        empty text instead of raising inside the tokenizer.

    Returns
    -------
    pandas.DataFrame
        One column, "Tokens", holding the cleaned string for each input text.
        When the input is a Series its index is preserved, so assigning the
        result back to the originating DataFrame aligns row by row even if
        that frame was sorted or filtered. Other iterables get a default
        RangeIndex.
    """
    # The hyphen sits last in the character class so it is a literal '-'
    # rather than forming the accidental range ':'..'_' . Tokens matched only
    # through that range were non-alphanumeric and were filtered out later,
    # so the final output is unchanged.
    pattern = r'''(?x)                 # set flag to allow verbose regexps
              (?:[A-Z]\.)+         # abbreviations, e.g. U.S.A.
              | \w+(?:-\w+)*       # words with optional internal hyphens
              | \$?\d+(?:\.\d+)?%? # currency and percentages, e.g. $12.40, 82%
              | \.\.\.             # ellipsis
              | [][.,;"'?():_`-]   # these are separate tokens; includes ], [
    '''
    lemmatizer = WordNetLemmatizer()

    def _clean_one(texto):
        # Cleans a single text and returns the space-joined surviving tokens.
        if not isinstance(texto, str):
            texto = "" if pd.isna(texto) else str(texto)

        # NOTE(review): lemmatization runs before lower-casing (as in the
        # original pipeline); WordNet lookups are case-sensitive, so
        # capitalized words may pass through unlemmatized -- confirm whether
        # that is intended.
        lemmas = [lemmatizer.lemmatize(token)
                  for token in nltk.regexp_tokenize(texto, pattern)]

        kept = [
            lemma.lower()
            for lemma in lemmas
            if lemma.lower() not in stop_words   # stopword removal
            and lemma.isalnum()                  # no punctuation / hyphenated tokens
            and len(lemma) >= 3                  # no very short tokens
        ]
        return ' '.join(kept)

    # Keep the caller's row labels so index-aligned assignment stays correct.
    indice = columna_datos.index if isinstance(columna_datos, pd.Series) else None
    textos_limpios = [_clean_one(texto) for texto in columna_datos]

    return pd.DataFrame({"Tokens": textos_limpios}, index=indice)

In [3]:
# Load the weekly news dataset and put it in chronological order.
data = pd.read_csv("./Datos ecopetrol/Datos_ecopetrol_semanales_divididos.csv")
data["Date"] = pd.to_datetime(data["Date"])
# A stable sort keeps same-date rows in file order (reproducible), and the
# index is reset so row labels match row positions after sorting.
data = data.sort_values(by="Date", kind="mergesort").reset_index(drop=True)
# clean_text returns a one-column DataFrame; take its values positionally so
# every title receives its own cleaned text regardless of index labels.
data["text"] = clean_text(data["Title"])["Tokens"].to_numpy()

# Independent copies of each split, so adding columns later does not write
# into a view of `data`.
train_weekly = data.loc[data["Split"] == "Train"].copy()
val_weekly = data.loc[data["Split"] == "Validation"].copy()
test_weekly = data.loc[data["Split"] == "Test"].copy()

In [17]:
from umap import UMAP

# Dimensionality reduction used by BERTopic; random_state is fixed so that
# repeated runs produce the same reduced embeddings.
umap_model = UMAP(n_neighbors=15, n_components=5,
                  min_dist=0.0, metric='cosine', random_state=42)

topic_model = BERTopic(
    language="english",
    nr_topics=40,                 # reduce the discovered topics down to 40
    calculate_probabilities=True,
    verbose=True,
    n_gram_range=(1, 1),          # unigram topic representations only
    umap_model=umap_model,
)

# Pass a plain list of strings (the documented input type) instead of a
# Series carrying the DataFrame's index.
train_docs = train_weekly["text"].tolist()
topics, probs = topic_model.fit_transform(train_docs)

# Assign a topic to each news item. `assign` builds a new frame rather than
# writing a column into a slice of `data`, and keeps this cell re-runnable.
train_weekly = train_weekly.assign(Topic=topics)

2024-04-14 11:46:34,058 - BERTopic - Embedding - Transforming documents to embeddings.


Batches: 100%|██████████| 118/118 [00:25<00:00,  4.69it/s]
2024-04-14 11:46:59,681 - BERTopic - Embedding - Completed ✓
2024-04-14 11:46:59,682 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-04-14 11:47:25,458 - BERTopic - Dimensionality - Completed ✓
2024-04-14 11:47:25,459 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-04-14 11:47:27,266 - BERTopic - Cluster - Completed ✓
2024-04-14 11:47:27,267 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-04-14 11:47:27,435 - BERTopic - Representation - Completed ✓
2024-04-14 11:47:27,437 - BERTopic - Topic reduction - Reducing number of topics
2024-04-14 11:47:27,555 - BERTopic - Topic reduction - Reduced number of topics from 82 to 40


In [20]:
# Write the training split (including the "Topic" column added after fitting)
# to CSV; index=False leaves the DataFrame index out of the file.
train_weekly.to_csv("./Datos ecopetrol/Datos_ecopetrol_topicos_train.csv", index=False)

In [21]:
# Persist the fitted topic model. No `serialization` argument is given, so the
# library default format is used.
# NOTE(review): presumably pickle-based in this BERTopic version -- confirm, and
# only load such files from trusted sources.
topic_model.save("./Modelos/bertopic_model")



In [18]:
# Display the per-topic summary table: topic id, document count, generated
# name, top terms and representative documents.
# NOTE(review): topic -1 is presumably BERTopic's outlier bucket -- confirm.
topic_model.get_topic_info()

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,889,-1_ecopetrol_oil_production_company,"[ecopetrol, oil, production, company, year, ne...","[plan increase gas reserve, gas new oil, oil c..."
1,0,597,0_ecopetrol_million_billion_000,"[ecopetrol, million, billion, 000, dividend, b...",[nation would receive billion ecopetrol divide...
2,1,396,1_colombia_fracking_colombian_stock,"[colombia, fracking, colombian, stock, pilot, ...","[colombia oil, fracking colombia, fracking col..."
3,2,264,2_oil_refinery_offshore_barrancabermeja,"[oil, refinery, offshore, barrancabermeja, spi...","[echeverry barrancabermeja refinery, barrancab..."
4,3,221,3_ecp_echeverry_plan_export,"[ecp, echeverry, plan, export, result, distrib...","[ecp investment plan, ecp investment plan, ecp..."
5,4,191,4_fuel_price_gas_natural,"[fuel, price, gas, natural, propane, minminas,...","[fuel price march, minminas increase fuel pric..."
6,5,163,5_uso_strike_protest_putumayo,"[uso, strike, protest, putumayo, agreement, wo...","[uso talk protest, uso, uso strike]"
7,6,94,6_csr_summary_positive_project,"[csr, summary, positive, project, report, star...","[may start positive report csr project, august..."
8,7,82,7_rubiales_pacific_field_reversion,"[rubiales, pacific, field, reversion, quifa, d...","[pacific rubiales report 2013 reserve, pacific..."
9,8,71,8_pipeline_attack_trasandino_contingency,"[pipeline, attack, trasandino, contingency, dr...",[ecopetrol activated contingency plan attack o...
