<b>BERTopic analysis</b>

In [1]:
import warnings
import os
import sys

# Switch off parallelism in the tokenizers library through its environment
# flag. This is set in the first cell, before any model library is imported.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# Suppress every Python warning for the rest of the session.
warnings.filterwarnings('ignore')

# Working directory of the kernel. os.getcwd() returns the same string as the
# %pwd magic but is plain Python, so the cell also runs outside IPython.
current_dir = os.getcwd()

# Directory two levels above the working directory. It is added to sys.path so
# that packages located there can be imported by the following cells.
parent_dir = os.path.abspath(os.path.join(current_dir, '../..'))
if parent_dir not in sys.path:  # re-running the cell must not stack duplicates
    sys.path.append(parent_dir)

<b>Import libraries</b>

In [2]:
import matplotlib.pyplot as plt
import pandas as pd
from bertopic import BERTopic

from src.main.pipeline.functions import stop_words_removal
from src.main.utilities import utils

<b>Read and clean dataset</b>

In [None]:
# Number of articles handed to BERTopic (the first N_DOCS rows after cleaning).
N_DOCS = 50000

# Read the line-delimited JSON dataset and run the project's cleaning step.
df = pd.read_json('../../dataset/News_Category_Dataset.json', lines=True)
df = utils.clean(df)

# Cut down to the rows that are actually used BEFORE the stop-word pass, so the
# per-row text processing is not spent on rows that are discarded afterwards.
# Assumes stop_words_removal treats each text independently — TODO confirm.
# Note: `df` now holds only these N_DOCS rows.
df = df.head(N_DOCS).copy()

# Remove stopwords from full_article
df['full_article'] = df['full_article'].apply(stop_words_removal)

# Documents (full_article) and their original categories, aligned by position.
docs = df['full_article'].tolist()
cat = df['category'].tolist()

<b>Create and fit BERTopic model</b>

In [None]:
# BERTopic configuration used for this run:
#   calculate_probabilities=True -> fit_transform also returns per-document
#                                   topic probabilities (second return value)
#   nr_topics=21                 -> number of topics requested from the model
#   verbose=True                 -> log progress while fitting
topic_model = BERTopic(language="english", calculate_probabilities=True, verbose=True, nr_topics=21)

# Fit BERTopic model and extract topics.
# The probabilities are bound to a real name instead of `_`: they were
# explicitly requested above, and assigning to `_` in a notebook shadows
# IPython's "last output" variable.
topics, probs = topic_model.fit_transform(docs)



<b>Show the topics found</b>

In [None]:
# Leave the table as the cell's last expression so the notebook renders it as
# a rich table instead of the plain-text dump that print() produces.
topic_model.get_topic_info()

<b>Recreate dataframe for counting unique categories per topic</b>

In [None]:
# Line up each document with its original category and its assigned topic,
# then count how many distinct categories occur within every topic.
# After this cell `df` is a Series indexed by topic id.
df = (
    pd.DataFrame({'Document': docs, 'Category': cat, 'Topic': topics})
    .groupby('Topic')['Category']
    .nunique()
)


In [None]:
# Bar chart: one bar per topic, height = number of distinct original
# categories among the documents assigned to that topic.
# NOTE(review): BERTopic normally uses topic -1 for outlier documents —
# confirm whether that bar should be part of this chart.
fig, ax = plt.subplots(figsize=(10, 6))
ax.bar(df.index, df.values, color='skyblue')

# Put a tick on every topic id rather than relying on automatic tick spacing,
# which does not necessarily line up with the bars.
ax.set_xticks(df.index)

ax.set_xlabel('Topics')
ax.set_ylabel('Number of Unique Categories')
ax.set_title('# Unique Categories per Topic')

plt.show()