# Topic Modeling Training

In [1]:
import openai
import pandas as pd
import dask.dataframe as dd 

from bertopic import BERTopic

from sentence_transformers import SentenceTransformer 
from bertopic.vectorizers import ClassTfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from bertopic.representation import MaximalMarginalRelevance

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# We train on Boston globe Data
lndf = dd.read_csv(
    "./TBG/TBG_unique_raw.csv",
    blocksize=100e6, 
    dtype={'author': 'object', 'edition': 'object','licensor_indexing_terms': 'object'}
)

In [3]:
lndf.dropna

<bound method DataFrame.dropna of Dask DataFrame Structure:
               pub_type position_section position_subsection     hl1     hl2  author    lede    body language word_count copyright content-id   volume issue_number edition pub_name pub_date licensor_indexing_terms indexing_terms
npartitions=81                                                                                                                                                                                                                      
                 object           object              object  object  object  object  object  object   object    float64    object     object  float64      float64  object   object   object                  object         object
                    ...              ...                 ...     ...     ...     ...     ...     ...      ...        ...       ...        ...      ...          ...     ...      ...      ...                     ...            ...
...                 ... 

In [4]:
sample = lndf.sample(frac=0.8, random_state=1)
sample_pd = sample.compute()

In [5]:
print("cleaning article body text...")
new_body = []
for body in sample["body"]:
    if body == "body":
        new_body.append("")
    else:
        try:
            new_body.append(body.split("body ")[1])
        except IndexError:
            new_body.append(body)
        except AttributeError:
            new_body.append("")
clean_body=pd.Series(new_body)

cleaning article body text...


In [6]:
print("initializing embeddings...")
sentence_model = SentenceTransformer("all-MiniLM-L6-v2")
vectorizer_model = CountVectorizer(stop_words='english')
representation_model = MaximalMarginalRelevance(diversity=0.8)
ctfidf_model = ClassTfidfTransformer(reduce_frequent_words=True)

initializing embeddings...


In [12]:
clean_body = clean_body[:1000] # Cut it short as we are not interested in training an effective model

In [13]:
embeddings_body = sentence_model.encode(clean_body)

In [14]:
print("fitting topic model...")
bglobe_topicmodel = BERTopic(
    n_gram_range = (1,3),
    embedding_model=sentence_model,
    vectorizer_model=vectorizer_model,
    representation_model=representation_model,
    ctfidf_model=ctfidf_model
)
topic_lede, prob_lede = bglobe_topicmodel.fit_transform(clean_body, embeddings_body)

fitting topic model...


In [15]:
print("save topic model...")
bglobe_topicmodel.save("./BERTopic_CPU_M1")

save topic model...
