# How to cluster contents using BERTopic 
**Link**: [BERTopic](https://maartengr.github.io/BERTopic/index.html)

## Imports

In [None]:
! pip install bertopic

In [None]:
import pandas as pd
import numpy as np
import pickle
import re
from bertopic import BERTopic
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import CountVectorizer
from umap import UMAP
from hdbscan import HDBSCAN

## Import and clean df

In [None]:
# Location of the raw contents export; despite its name, `url` holds a relative file path.
url = "../raw_data/contents_v3.csv"
df_content = pd.read_csv(filepath_or_buffer=url)

In [None]:
# Keep only the rows whose `markdown` cell is neither missing nor an empty string.
markdown_column = df_content['markdown']
has_markdown = markdown_column.notna() & (markdown_column != '')
df_filtered = df_content.loc[has_markdown].copy()

In [None]:
# Restrict the data to the three content types used in the rest of the notebook.
kept_types = ['article', 'fiche_outils', 'guide_pratique']
is_kept_type = df_filtered['type'].isin(kept_types)
df_filtered = df_filtered.loc[is_kept_type].copy()

In [None]:
# Work on an independent copy holding only the identifier, content type and raw markdown.
df = df_filtered.loc[:, ['id', 'type', 'markdown']].copy()

In [None]:
def clean_text(text):
    """Strip common Markdown markup from *text* and collapse its whitespace.

    Removes header markers, bold/italic asterisks, inline-code backticks and
    link/image syntax (keeping the visible label or alt text), then squeezes
    every run of whitespace into a single space.

    Args:
        text: Raw Markdown string, or a missing value (``None``/``NaN``).

    Returns:
        The cleaned text, or ``""`` when the input is missing or when fewer
        than three whitespace-separated words remain after cleaning.
    """
    if pd.isna(text):
        return ""

    text = re.sub(r'#+\s*', '', text)  # Header markers
    text = re.sub(r'\*{1,2}([^*]+)\*{1,2}', r'\1', text)  # Bold/italic
    text = re.sub(r'`([^`]+)`', r'\1', text)  # Inline code
    # Links and images: keep the label/alt text and drop the target. The
    # optional "!" and the possibly empty label cover ![alt](src) and ![](src),
    # which would otherwise leave a stray "!" or the whole URL in the text.
    text = re.sub(r'!?\[([^\]]*)\]\([^)]*\)', r'\1', text)
    text = re.sub(r'\s+', ' ', text).strip()

    # Documents with fewer than three words are treated as unusable.
    return text if len(text.split()) >= 3 else ""

In [None]:
# Run clean_text over every raw markdown cell and store the result next to the original.
df['markdown_clean'] = df['markdown'].map(clean_text)

In [None]:
# Drop the rows whose cleaned text is the empty string before modelling.
is_usable = df['markdown_clean'] != ''
df_valid = df.loc[is_usable].copy()

## Let's model!

In [None]:
# Sentence-embedding model handed to BERTopic, loaded by its model identifier.
embedding_model = SentenceTransformer(
    model_name_or_path='OrdalieTech/Solon-embeddings-large-0.1',
)

In [None]:
# Words excluded from the topic keyword representations.
french_stopwords = [
    # Generic French function words and very frequent terms.
    'le', 'de', 'et', 'à', 'un', 'il', 'être', 'en',
    'avoir', 'que', 'pour', 'dans', 'ce', 'son', 'une', 'sur',
    'avec', 'ne', 'se', 'pas', 'tout', 'plus', 'par', 'grand',
    'mais', 'comme', 'premier', 'leur', 'temps', 'même', 'si', 'faire',
    'ces', 'du', 'la', 'les', 'des', 'ses', 'nous', 'vous',
    'ils', 'elle', 'elles', 'cette', 'celui', 'ceux', 'celle',
] + [
    # School-related words, also excluded from the keyword representations.
    'élève', 'élèves',
    'enseignant', 'enseignants',
    'école', 'classe',
]

In [None]:
# Bag-of-words vectorizer handed to BERTopic to build the topic keyword representations.
# NOTE(review): BERTopic presumably fits this vectorizer on one concatenated document
# per topic, so min_df/max_df would be counted over topics rather than source
# documents; with very few topics max_df=0.8 could fall below min_df=2 — confirm.
vectorizer = CountVectorizer(
    stop_words=french_stopwords,
    ngram_range=(1, 2),  # single words and two-word phrases
    max_features=1000,  # cap on the vocabulary size
    min_df=2,
    max_df=0.8,
)

In [None]:
# Dimensionality-reduction step handed to BERTopic.
umap_model = UMAP(
    metric='cosine',
    n_components=5,  # size of the reduced space
    n_neighbors=15,
    min_dist=0.0,
    random_state=42,  # fixed seed so repeated runs are comparable
)

In [None]:
# Clustering step handed to BERTopic.
hdbscan_model = HDBSCAN(
    metric='euclidean',
    min_cluster_size=5,  # At least 5 documents per cluster
    cluster_selection_method='eom',
    prediction_data=True,
)

In [None]:
# Assemble the topic model from the embedding, UMAP, HDBSCAN and vectorizer objects.
# NOTE(review): `language` presumably only selects a default embedding model, so it
# likely has no effect once `embedding_model` is given — confirm in the BERTopic docs.
topic_model = BERTopic(
    language="french",
    embedding_model=embedding_model,
    umap_model=umap_model,
    hdbscan_model=hdbscan_model,
    vectorizer_model=vectorizer,
    nr_topics="auto",
    calculate_probabilities=True,
    verbose=True,
)

In [None]:
# Fit the topic model on the cleaned texts and keep both values it returns.
documents = df_valid['markdown_clean'].to_list()
topics, probabilities = topic_model.fit_transform(documents=documents)

In [None]:
# Attach each document's assigned topic and the highest value of its probability row.
# assumes `probabilities` is a 2-D (documents x topics) array — TODO confirm
# NOTE(review): the row maximum is taken regardless of the assigned topic, so rows
# assigned to topic -1 also receive their best topic's probability.
df_valid['topic'] = topics
df_valid['topic_probability'] = np.max(probabilities, axis=1)

In [None]:
# Count documents per (type, topic) pair, then pivot topics into columns (0 where absent).
pair_counts = df_valid.groupby(['type', 'topic']).size()
topic_by_type = pair_counts.unstack(fill_value=0)

In [None]:
# Fetch the model's topic summary table; the bare name below displays it as the cell output.
topic_info = topic_model.get_topic_info()
topic_info

In [None]:
# Build a human-readable label for every topic id listed in topic_info.
topic_labels = {}
for topic_num, representation in zip(topic_info['Topic'], topic_info['Representation']):
    if topic_num == -1:
        # Topic -1 gets a fixed label instead of keywords.
        topic_labels[topic_num] = "Divers/Outliers"
        continue

    # Use the first three keywords as the label. Empty strings are skipped:
    # BERTopic can pad a representation with "" when a topic has fewer usable
    # terms, which would otherwise produce labels ending in a dangling " | ".
    keywords = [word for word in representation if word][:3]
    if keywords:
        topic_labels[topic_num] = " | ".join(keywords).title()
    else:
        # No usable keyword at all: fall back to the numeric id.
        topic_labels[topic_num] = f"Topic {topic_num}"

In [None]:
# Translate every document's topic id into its readable label.
topic_ids = df_valid['topic']
df_valid['topic_label'] = topic_ids.map(topic_labels)

In [None]:
# Per-topic summary: number of documents and mean topic_probability, rounded to 3 decimals.
topic_stats = (
    df_valid
    .groupby(['topic', 'topic_label'])
    .agg(
        count=('id', 'count'),
        avg_probability=('topic_probability', 'mean'),
    )
    .round(3)
    .reset_index()
)

In [None]:
# Contingency table of topic labels (rows) by content type (columns), including margin totals.
cross_analysis = pd.crosstab(
    index=df_valid['topic_label'],
    columns=df_valid['type'],
    margins=True,
)

In [None]:
# Display the contingency table as the cell output.
cross_analysis

## Export pickles

In [None]:
from google.colab import drive

# Mount Google Drive at /content/drive so the files below can be written to it.
drive.mount('/content/drive')

# Serialize the fitted model first, then the label mapping, each to its own pickle file.
# NOTE: only unpickle these files from a trusted location — pickle can run code on load.
for pickle_path, payload in (
    ('/content/drive/MyDrive/bertopic_model_etreprof.pkl', topic_model),
    ('/content/drive/MyDrive/topic_labels_etreprof.pkl', topic_labels),
):
    with open(pickle_path, 'wb') as f:
        pickle.dump(payload, f)