# Imports

In [27]:
import pandas as pd
from transformers import AutoTokenizer, AutoConfig
from sentence_transformers import SentenceTransformer
from numpy import logical_and
from bertopic import BERTopic

# Parameters

In [28]:
model_name = "Alibaba-NLP/gte-multilingual-base"

# Pre-processing 

In [29]:
df_raw = pd.read_csv("../../data/theses-soutenues-clean-with-index.csv")

In [30]:
tokenizer = AutoTokenizer.from_pretrained(model_name)

df_raw["resumes.en.len"] = df_raw["resumes.en"].apply(lambda res : len(res))
df_raw["resumes.fr.len"] = df_raw["resumes.fr"].apply(lambda res : len(res))

In [31]:
df_raw["resumes.en.len"].describe()

count    164379.000000
mean       1777.648082
std         735.027732
min           1.000000
25%        1324.000000
50%        1617.000000
75%        2080.000000
max       12010.000000
Name: resumes.en.len, dtype: float64

In [32]:
df_raw["resumes.fr.len"].describe()

count    164379.000000
mean       1984.935119
std         802.720810
min           6.000000
25%        1508.000000
50%        1702.000000
75%        2362.000000
max       12207.000000
Name: resumes.fr.len, dtype: float64

In [33]:
config = AutoConfig.from_pretrained(model_name, trust_remote_code = True)
config.max_position_embeddings

8192

In [34]:
valid_index = logical_and.reduce([
    df_raw["resumes.fr.len"] >= 1000,
    df_raw["resumes.fr.len"] <= 4000,
    df_raw["resumes.en.len"] >= 1000,
    df_raw["resumes.en.len"] <= 4000,
])
df = df_raw.loc[valid_index,:]
print(f"Proportion of the dataset preserved: {100 * len(df) / len(df_raw):.0f} %")

Proportion of the dataset preserved: 89 %


In [35]:
df_stratified = (
    df
    .groupby("year")
    .apply(lambda x : x.sample(n = 500), include_groups=False)
)
print(f"Size of the dataset: {len(df_stratified)}")

Size of the dataset: 6500


In [40]:
docs = df_stratified["resumes.en"].sample(frac = 1).to_list()
sentence_model = SentenceTransformer(model_name, trust_remote_code = True)
embeddings = sentence_model.encode(docs, show_progress_bar=True, )

Some weights of the model checkpoint at Alibaba-NLP/gte-multilingual-base were not used when initializing NewModel: ['classifier.bias', 'classifier.weight']
- This IS expected if you are initializing NewModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing NewModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Batches:   0%|          | 0/204 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [19]:
topic_model = BERTopic(language = "english").fit(documents=docs)

In [24]:
hierarchical_topics = topic_model.hierarchical_topics(docs)

100%|██████████| 96/96 [00:00<00:00, 435.36it/s]


In [25]:
topic_model.visualize_hierarchical_documents(docs, hierarchical_topics)

KeyError: 1580