## Importer les librairies nécessaires

In [1]:
import os
import json
from bs4 import BeautifulSoup

## Approche_0 : titre et url 

In [2]:
# Définir la fonction parser avec beautifulsoup
def parser_favoris(html_file):
    """Parser un fichier HTML de favoris."""
    with open(html_file, 'r', encoding='utf-8') as file:
        soup = BeautifulSoup(file, 'lxml')
    
    # Trouver tous les liens
    links = soup.find_all('a')
    
    favoris = []
    for link in links:
        href = link.get('href')
        if href:
            title = link.text
            favoris.append({'title': title, 'href': href})
    
    return favoris

In [3]:
# Définir une fonction pour combiner les 2 listes de favoris, tout en supprimant les doublons d'url parmi eux
def combiner_favoris(favoris_pro, favoris_perso):
    """Combiner deux listes de favoris, en supprimant les doublons."""
    # Utiliser un dictionnaire pour supprimer les doublons
    favoris_dict = {}
    for favori in favoris_pro + favoris_perso:
        favoris_dict[favori['href']] = favori
    
    # Convertir le dictionnaire en liste
    favoris_combines = list(favoris_dict.values())
    
    return favoris_combines

In [4]:
# Définir une fonction qui sauvegarde en json les favoris combinés
def sauvegarder_favoris(favoris, output_file):
    """Sauvegarder les favoris combinés dans un fichier JSON."""
    with open(output_file, 'w', encoding='utf-8') as file:
        json.dump(favoris, file, indent=4, ensure_ascii=False)

In [5]:
if __name__ == "__main__":
    favoris_pro_file = 'favoris_pro.html'
    favoris_perso_file = 'favoris_perso.html'
    output_file = 'favoris_combines.json'
    
    favoris_pro = parser_favoris(favoris_pro_file)
    favoris_perso = parser_favoris(favoris_perso_file)
    
    favoris_combines = combiner_favoris(favoris_pro, favoris_perso)
    
    sauvegarder_favoris(favoris_combines, output_file)
    
    print("Favoris combinés avec succès!")

Favoris combinés avec succès!


In [6]:
# Créer un dataframe du fichier contenant les favoris combinés
import pandas as pd
import re

# Charger le fichier JSON dans un DataFrame
df_bookmark = pd.read_json('favoris_combines.json')

# Supprimer les émojis et les caractères non ASCII
df_bookmark['title'] = df_bookmark['title'].apply(lambda x: re.sub(r'[^\x00-\x7F]+', '', x))

# Information sur le DataFrame
df_bookmark.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3334 entries, 0 to 3333
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   title   3334 non-null   object
 1   href    3334 non-null   object
dtypes: object(2)
memory usage: 52.2+ KB


In [7]:
df_bookmark.head(20)

Unnamed: 0,title,href
0,Gemini,https://accounts.google.com/v3/signin/identifi...
1,HuggingChat,https://huggingface.co/chat/
2,Blackbox.ai,https://www.blackbox.ai/chat/expert-python
3,Python Formatter and Beautifier,https://codebeautify.org/python-formatter-beau...
4,GitHub - manojVivek/medium-unlimited at the-tl...,https://github.com/manojVivek/medium-unlimited...
5,Khuyen's Links,https://bit.ly/m/khuyentran
6,Rob Mulla | Grandmaster | Kaggle,https://www.kaggle.com/robikscube
7,"CodeCut Stay sharp, learn in a snap",https://codecut.ai/?utm_source=linkedin.com%2F...
8,"CodeCut Stay sharp, learn in a snap",https://codecut.ai/
9,How to fix - Python pip install connection err...,https://jhooq.com/pip-install-connection-error/


## Machine Learning : Création d'étiquettes de thème en automatique

In [8]:
import numpy as np
import plotly.graph_objects as go
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.feature_extraction.text import TfidfVectorizer
from spacy.util import minibatch, compounding
from spacy import displacy
import spacy
from spacy.lang.fr.stop_words import STOP_WORDS as fr_stop
from spacy.lang.en.stop_words import STOP_WORDS as en_stop


In [9]:
df_bookmark.head(25)

Unnamed: 0,title,href
0,Gemini,https://accounts.google.com/v3/signin/identifi...
1,HuggingChat,https://huggingface.co/chat/
2,Blackbox.ai,https://www.blackbox.ai/chat/expert-python
3,Python Formatter and Beautifier,https://codebeautify.org/python-formatter-beau...
4,GitHub - manojVivek/medium-unlimited at the-tl...,https://github.com/manojVivek/medium-unlimited...
5,Khuyen's Links,https://bit.ly/m/khuyentran
6,Rob Mulla | Grandmaster | Kaggle,https://www.kaggle.com/robikscube
7,"CodeCut Stay sharp, learn in a snap",https://codecut.ai/?utm_source=linkedin.com%2F...
8,"CodeCut Stay sharp, learn in a snap",https://codecut.ai/
9,How to fix - Python pip install connection err...,https://jhooq.com/pip-install-connection-error/


In [10]:
# Définir le vectorizer avec les modèles de langage
final_stopwords_list = list(fr_stop) + list(en_stop)
tfidf_vectorizer = TfidfVectorizer( 
    max_df=0.985,
    # max_features=2000000,
    min_df=0.00005,
    stop_words=final_stopwords_list,
    use_idf=True,  
    ngram_range=(1,12)
)

In [11]:
# Charger les données
df_bookmark = pd.read_json('favoris_combines.json')

# Prétraitement des données
df_bookmark['title'] = df_bookmark['title'].apply(lambda x: x.lower())
df_bookmark['title'] = df_bookmark['title'].apply(lambda x: x.replace(',', '').replace('.', '').replace('|', '-'))
# Supprimer les émojis et les caractères non ASCII
df_bookmark['title'] = df_bookmark['title'].apply(lambda x: re.sub(r'[^\x00-\x7F]+', '', x))                                                                  


# Transformer les données
X = tfidf_vectorizer.fit_transform(df_bookmark['title'])



In [12]:
# Déterminer le nombre optimal de clusters
sse = []
silhouette = []
for k in range(10,100):
    kmeans = KMeans(n_clusters=k,)
    kmeans.fit(X)
    sse.append(kmeans.inertia_)
    silhouette.append(silhouette_score(X, kmeans.labels_))

# Tracer la courbe de la SSE
fig_sse = go.Figure(data=[go.Scatter(x=list(range(10,100)), y=sse, mode='lines+markers')])
fig_sse.update_layout(title='Méthode du coude', xaxis_title='Nombre de clusters', yaxis_title='SSE')
fig_sse.show()

# Tracer la courbe du coefficient de silhouette
fig_silhouette = go.Figure(data=[go.Scatter(x=list(range(10,100)), y=silhouette, mode='lines+markers')])
fig_silhouette.update_layout(title='Coefficient de silhouette', xaxis_title='Nombre de clusters', yaxis_title='Coefficient de silhouette')
fig_silhouette.show()

# Déterminer le nombre optimal de clusters
k_optimal = np.argmax(silhouette) + 2
print(f'Nombre optimal théorique de clusters : {k_optimal}')

  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **

Nombre optimal théorique de clusters : 7


Nous prendrons k=15 clusters

In [13]:
# Transformer les données
X = tfidf_vectorizer.fit_transform(df_bookmark['title'])

# Déterminer le nombre optimal de clusters
n_clusters = 15

# Faire un clustering des données
kmeans = KMeans(n_clusters=n_clusters)
kmeans.fit(X)

# Obtenir les étiquettes pour chaque cluster
labels = kmeans.labels_

# Créer un dictionnaire pour stocker les étiquettes des clusters
cluster_labels = {}

In [14]:
labels

array([0, 0, 0, ..., 0, 0, 0], shape=(3334,), dtype=int32)

In [15]:
# Utiliser spaCy pour faire un étiquetage automatique des clusters
nlp_fr = spacy.load("fr_dep_news_trf")


You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature.



In [16]:
for i in range(n_clusters):
    cluster_docs = df_bookmark[labels == i]['title']
    cluster_text =' '.join(cluster_docs)  # Utiliser un espace pour séparer les documents
    doc = nlp_fr(cluster_text)
    entities = [ent.text for ent in doc.ents]
    nouns = [token.text for token in doc if token.pos_ == 'NOUN']
    verbs = [token.text for token in doc if token.pos_ == 'VERB']
    keywords = list(set(entities + nouns + verbs))  # Utiliser un set pour supprimer les doublons
    cluster_label = ', '.join(keywords[:6])  # Prendre les 5 premiers mots clés
    cluster_labels[i] = cluster_label

In [17]:
print(cluster_docs)

2809    (44) creating dummy data in python using faker...
Name: title, dtype: object


In [18]:
len(cluster_docs)

1

In [19]:
cluster_label

''

In [None]:
# Répercuter les étiquettes dans la colonne "theme" de df_bookmark
df_bookmark['theme'] = df_bookmark.index.map(lambda x: cluster_labels[labels[x]])

In [None]:
df_bookmark.head(25)