In [None]:
def calculate_date_interval(df, figsize):
    """
    Compute a dynamic step between date labels on the x-axis.

    The width of one rendered date label (``%d-%m-%Y`` format) is measured on
    a throw-away figure and compared with the pixel width of a figure of size
    ``figsize`` at the DPI of the current figure.

    Args:
        df: Object whose ``index`` holds the dates; ``df.index[0]`` must
            provide ``strftime``.
        figsize: ``(width, height)`` pair (presumably inches, as for
            matplotlib's ``figsize``); only the width is used.

    Returns:
        int: Step (always >= 1) between two labelled dates. ``1`` is returned
        when the index is empty.
    """
    # Nothing to label: return the smallest valid interval instead of failing
    # on df.index[0].
    if len(df.index) == 0:
        return 1

    # Total width available, in pixels.
    fig_width, _ = figsize
    total_width_px = fig_width * plt.gcf().dpi

    # Estimate the width of one date label by rendering it on a scratch figure.
    sample_date = df.index[0].strftime('%d-%m-%Y')
    probe_fig = plt.figure()
    try:
        probe_text = plt.text(0.5, 0.5, sample_date)
        plt.draw()
        date_width_px = probe_text.get_window_extent().width
    finally:
        # Always release the scratch figure, even if the measurement fails.
        plt.close(probe_fig)

    # NOTE(review): empirical factor kept from the original code; it treats a
    # label as four times narrower than measured — confirm the intent.
    date_width_px = date_width_px / 4

    max_dates_without_overlap = total_width_px / date_width_px
    return max(1, int(np.floor(len(df.index) / max_dates_without_overlap)))

In [None]:
"""
Applies a transformer-based lemmatisation to each document using several processes (one for each cpu core)
"""
def treat_documents():
    """
    Run ``worker`` over every entry of ``documents`` in parallel and store the
    results, in document order, in the module-level result lists.

    One process is started per CPU core. Each document is sent to the workers
    as an ``(index, document)`` pair; each result is expected to be a 5-tuple
    ``(index, lemmatised_document, tab_pos, norms, lemmas)``. Results are
    appended to ``documents_lemmatized``, ``all_tab_pos``, ``sentences_norms``
    and ``sentences_lemmas``.
    """
    global documents_lemmatized, all_tab_pos, sentences_norms, sentences_lemmas, language

    # Synchronised queues: one feeds the workers, the other brings results back.
    input_queue = Queue()
    output_queue = Queue()

    # Start one worker process per CPU core.
    workers = []
    for _ in range(cpu_count()):
        proc = Process(
            target=worker,
            args=(input_queue, output_queue, language, {"total_docs": len(documents)}),
        )
        proc.start()
        workers.append(proc)

    # Queue every document together with its position so order can be restored.
    for indexed_doc in enumerate(documents):
        input_queue.put(indexed_doc)

    # Collect exactly one result per document, showing a progress bar.
    progress = tqdm(range(len(documents)),
                    desc="DOCUMENTS PROCESSED",
                    maxinterval=1000,
                    bar_format="{l_bar}{bar:10}{r_bar}")
    results = [output_queue.get() for _ in progress]

    # One "STOP" message per worker (presumably the shutdown signal understood
    # by `worker`), then wait for every process to finish.
    for _ in workers:
        input_queue.put("STOP")
    for proc in workers:
        proc.join()

    # Results arrive in completion order: restore the document order first.
    results.sort(key=lambda item: item[0])
    for _, document_lemmatized, tab_pos, norms, lemmas in results:
        documents_lemmatized.append(document_lemmatized)
        all_tab_pos.append(tab_pos)
        sentences_norms.append(norms)
        sentences_lemmas.append(lemmas)

In [None]:
def extract_terms_with_scores(spacy_object, nb_candidates_to_extract_from_terms_extractions):
    """
    Return the highest-scoring C-value terms of one processed document.

    Args:
        spacy_object: Object exposing ``_.cvalues``, which must support
            ``sort_values``, ``head`` and ``items`` (e.g. a pandas Series
            mapping term -> score).
        nb_candidates_to_extract_from_terms_extractions: Maximum number of
            terms to keep.

    Returns:
        list: ``(term, score)`` pairs sorted by decreasing score.
    """
    scores = spacy_object._.cvalues
    ranked = scores.sort_values(ascending=False)
    best = ranked.head(nb_candidates_to_extract_from_terms_extractions)
    return list(best.items())

In [None]:
def extract_terms(spacy_objects, nb_candidates_to_extract_from_terms_extractions):
    """
    Merge the best terms of several processed documents into one ranking.

    The top terms of each object are obtained with
    ``extract_terms_with_scores``; scores of a term that appears in several
    objects are added together.

    Args:
        spacy_objects: Iterable of objects accepted by
            ``extract_terms_with_scores``.
        nb_candidates_to_extract_from_terms_extractions: Number of terms taken
            from each object, and also the size of the final ranking.

    Returns:
        list: ``(term, cumulative_score)`` pairs, highest score first.
    """
    limit = nb_candidates_to_extract_from_terms_extractions

    # Sum the scores of every distinct term across all objects.
    totals = defaultdict(float)
    for spacy_obj in spacy_objects:
        for term, score in extract_terms_with_scores(spacy_obj, limit):
            totals[term] += score

    # Rank by cumulative score and keep only the best entries.
    ranked = sorted(totals.items(), key=lambda pair: pair[1], reverse=True)
    return ranked[:limit]

In [None]:
def go_tfidf_vectorization(unigrams):
    """
    Tokenise every entry of ``all_tab_pos`` in parallel and build a TF-IDF matrix.

    Each entry of the module-level ``all_tab_pos`` is paired with ``unigrams``
    and passed to ``tokenize_and_stem`` through a process pool. The resulting
    token lists are counted as-is (no further analysis, no lowercasing) and
    then weighted with an L2-normalised TF-IDF whose ``sublinear_tf`` flag
    comes from the module-level ``sub_linear_tfidf``.

    Args:
        unigrams: Value forwarded unchanged to ``tokenize_and_stem`` with
            every document.

    Returns:
        tuple: ``(count_vectorizer, X, tfidf_feature_names,
        tokenized_documents)`` — the fitted vectoriser, the TF-IDF matrix,
        the feature names and the tokenised documents.
    """
    jobs = [(tab_pos, unigrams) for tab_pos in all_tab_pos]

    with multiprocessing.Pool() as pool:
        tokenized_documents = pool.map(tokenize_and_stem, jobs)

    def identity_analyzer(tokens):
        # Documents are already token lists: hand them over untouched.
        return tokens

    # Counting and TF-IDF weighting are done in two steps rather than with a
    # single TfidfVectorizer; the L2 norm accounts for differing document sizes.
    count_vectorizer = CountVectorizer(analyzer=identity_analyzer, lowercase=False)
    word_count = count_vectorizer.fit_transform(tokenized_documents)

    weighting = TfidfTransformer(norm='l2', sublinear_tf=sub_linear_tfidf)
    X = weighting.fit_transform(word_count)

    tfidf_feature_names = count_vectorizer.get_feature_names_out()

    return count_vectorizer, X, tfidf_feature_names, tokenized_documents

In [None]:
def write_sentences_results(out_name, final_top_ngrams_per_topic, max_rows=10):
    """
    Write the per-topic sentences and their scores to a CSV file.

    The file has two columns per topic (``<i>_sentences``, ``<i>_scores``)
    and exactly ``max_rows`` data rows; topics with fewer entries are padded
    with empty cells.

    Args:
        out_name: Path of the CSV file to create (overwritten if it exists).
        final_top_ngrams_per_topic: One sequence per topic, each holding
            ``(sentence, score)`` pairs.
        max_rows: Number of data rows to write (defaults to 10, the value
            previously hard-coded).
    """
    # newline='' is what the csv module requires: without it the writer's
    # '\r\n' terminators are translated again on Windows, producing blank lines.
    with open(out_name, "w", newline='', encoding='utf-8') as file_object:
        writer = csv.writer(file_object)

        # Header: two columns per topic.
        headers = []
        for topic_idx in range(len(final_top_ngrams_per_topic)):
            headers.extend([f'{topic_idx}_sentences', f'{topic_idx}_scores'])
        writer.writerow(headers)

        # Data: row i holds the i-th entry of every topic.
        for row_idx in range(max_rows):
            row = []
            for topic_entries in final_top_ngrams_per_topic:
                if row_idx < len(topic_entries):
                    row.extend(topic_entries[row_idx])
                else:
                    row.extend(('', ''))
            writer.writerow(row)

In [None]:
def write_unigrams_results(nb_words, tfidf_feature_names, out_name, nmf_H):
    """
    Write the strongest words of every topic, with their weights, to a CSV file.

    Row ``i`` holds, for each topic in turn, the ``i``-th best word followed
    by its weight. No header row is written.

    Args:
        nb_words: Number of words kept per topic.
        tfidf_feature_names: Sequence mapping a feature index to its word.
        out_name: Path of the CSV file to create (overwritten if it exists).
        nmf_H: Iterable of per-topic weight vectors supporting ``argsort``
            (e.g. the rows of a 2-D numpy array).
    """
    ranked_topics = []
    for topic in nmf_H:
        # Indices of the nb_words largest weights, largest first.
        top_indices = topic.argsort()[:-nb_words - 1:-1]
        entries = [[tfidf_feature_names[i], topic[i]] for i in top_indices]
        # Stable sort by decreasing weight (keeps the order of equal weights).
        entries.sort(key=lambda entry: -entry[1])
        ranked_topics.append(entries)

    max_rows_nb = max((len(entries) for entries in ranked_topics), default=0)

    with open(out_name, "w", encoding='utf-8') as file_object:
        # The csv writer quotes words containing commas or quotes, which the
        # previous manual concatenation did not. lineterminator='\n' keeps the
        # line endings this function has always produced.
        writer = csv.writer(file_object, lineterminator='\n')

        for row_idx in range(max_rows_nb):
            row = []
            for entries in ranked_topics:
                # A topic with fewer entries contributes nothing to this row
                # (cannot happen when nmf_H is a rectangular matrix).
                if row_idx < len(entries):
                    word, weight = entries[row_idx]
                    row.extend((word, str(weight)))
            writer.writerow(row)

In [None]:
"""
Imports all required libraries for the notebook to function. If a library is not present on the system, it is downloaded instead
"""
def download_things():
    global folder_path
    global language

    libraries = {
        'seaborn': ('import seaborn', 'pip install seaborn'),
        'gensim': ('import gensim', 'pip install gensim'),
        'cleantext': ('import cleantext', 'pip install cleantext'),
        'adjustText': ('import adjustText', 'pip install adjustText'),
        'umap': ('import umap', 'pip install umap-learn'),
        'nltk': ('import nltk', 'pip install nltk'),
        'sklearn': ('import sklearn', 'pip install -U scikit-learn'),
        'scipy': ('import scipy', 'pip install -U scipy'),
        'matplotlib': ('import matplotlib', 'pip install -U matplotlib'),
        'spacy': ('import spacy', 'pip install spacy'),
        'spacy-transformers': ('import transformers', 'pip install spacy-transformers'),
        'pyate': ('import pyate', 'pip install pyate'),
        'multiprocess': ('import multiprocess', 'pip install multiprocess'),
        'bs4': ('import bs4', 'pip install beautifulsoup4'),
        'unidecode': ('import unidecode', 'pip install unidecode'),
        'statmodels': ('import statmodels', 'pip install statmodels')
    }

    for library, (import_cmd, pip_cmd) in libraries.items():
        print('go', library)
        try:
            if import_cmd:  # If there's an import command provided
                exec(import_cmd)
            else:
                raise ImportError  # This will force the code to the except block
        except ImportError:
            result = subprocess.run(pip_cmd.split(), stdout=subprocess.PIPE, stderr=subprocess.PIPE)

    # Download the spacy model based on the language selected by the user
    if language == 'fr':
        %pip install -r ./requirements/requirements_fr.txt
    elif language == 'en':
        !spacy download en_core_web_trf
    elif language == 'es':
        !spacy download es_dep_news_trf
    elif language == 'de':
        !spacy download de_dep_news_trf
    elif language == 'ca':
        !spacy download ca_core_news_trf
    elif language == 'zh':
        !spacy download zh_core_web_trf
    elif language == 'da':
        !spacy download da_core_news_trf
    elif language == 'ja':
        !spacy download ja_core_news_trf
    elif language == 'sl':
        !spacy download sl_core_news_trf
    elif language == 'uk':
        !spacy download uk_core_news_trf
    
    os.environ["TOKENIZERS_PARALLELISM"] = "false"

In [None]:
def determine_nmf(initial_topic_num=5, terminal_topic_num=20):
    """
    Fit one NMF model per topic count and record its results in the
    module-level dictionaries.

    Topic counts are ``range(initial_topic_num, terminal_topic_num + 5, 5)``.
    NOTE(review): when ``terminal_topic_num - initial_topic_num`` is not a
    multiple of 5 this range goes past ``terminal_topic_num`` — confirm that
    this is intended.

    For every topic count ``n`` the following entries are written:

    * ``all_nmf_W[n]`` / ``all_nmf_H[n]``: document-topic and topic-term matrices;
    * ``every_topics_and_scores_by_document[n][doc]``: ``(topic, score)``
      pairs of the document, highest score first;
    * ``relevant_i_by_topics[n][topic]``: indices of at most 10 documents
      whose dominant topic is ``topic``, highest score first.

    Only the entries of the topic counts handled by this call are touched, so
    the function can be called again with another range.

    Args:
        initial_topic_num: First topic count to fit.
        terminal_topic_num: Upper bound used to build the range (see above).
    """
    global all_nmf_H
    global all_nmf_W
    global relevant_i_by_topics
    global every_topics_and_scores_by_document

    for num_topic in tqdm(range(initial_topic_num, terminal_topic_num+5, 5),
                          bar_format="{l_bar}{bar:10}{r_bar}",
                          desc="TOPICS CONFIGURATIONS PROCESSED"):
        nmf_model = NMF(n_components=num_topic, random_state=1, max_iter=2000, l1_ratio=0.5, init='nndsvd').fit(tfidf)
        nmf_W = nmf_model.transform(tfidf)
        all_nmf_W[num_topic] = nmf_W
        all_nmf_H[num_topic] = nmf_model.components_

        all_topics_and_scores_by_document = {}
        # dominant topic -> [[document index, score], ...]
        dominant_docs_by_topic = {}

        for i in range(len(nmf_W)):
            doc_scores = nmf_W[i]
            topics_and_scores_sorted = sorted(
                ((topic_num, doc_scores[topic_num]) for topic_num in range(doc_scores.shape[0])),
                key=lambda x: x[1],
                reverse=True,
            )
            all_topics_and_scores_by_document[i] = topics_and_scores_sorted

            best_topic, best_score = topics_and_scores_sorted[0]
            dominant_docs_by_topic.setdefault(best_topic, []).append([i, best_score])

        every_topics_and_scores_by_document[num_topic] = all_topics_and_scores_by_document

        # Keep the 10 best documents of each topic. This is done per
        # configuration: the previous global pass also revisited entries that
        # an earlier call had already reduced to plain indices, and failed on them.
        relevant_i_by_topics[num_topic] = {
            topic: [doc_idx for doc_idx, _ in sorted(docs, key=lambda x: x[1], reverse=True)[:10]]
            for topic, docs in dominant_docs_by_topic.items()
        }

In [None]:
def terms_and_entities_extraction():
    """
    Extract, for every topic of every topic configuration, its best terms and
    its proper-noun sequences, then save both to disk.

    For each topic, the lemmatised texts of its documents (taken from
    ``relevant_i_by_topics``) are concatenated and run through a spaCy
    pipeline with the ``cvalues`` component added (presumably registered by
    an imported term-extraction package — verify); the 10 best terms are kept
    via ``extract_terms``. Proper-noun sequences come from
    ``extract_np_sequences`` and are counted per topic; sequences of a single
    character are ignored. Each count is then divided by the square root of
    the number of occurrences of the same sequence in the *other* topics of
    the configuration, when that number is positive.

    Results are written with ``save_to_disk_terms_or_entities``.

    Raises:
        ValueError: If the module-level ``language`` has no associated model.
    """
    global language

    spacy_models = {
        'fr': 'fr_dep_news_trf',
        'en': 'en_core_web_trf',
        'es': 'es_dep_news_trf',
        'de': 'de_dep_news_trf',
        'ca': 'ca_core_news_trf',
        'zh': 'zh_core_web_trf',
        'da': 'da_core_news_trf',
        'ja': 'ja_core_news_trf',
        'sl': 'sl_core_news_trf',
        'uk': 'uk_core_news_trf',
    }
    # Fail with an explicit message rather than an unbound local later on.
    if language not in spacy_models:
        raise ValueError(f"Unsupported language for terms extraction: {language!r}")

    def convert_to_dict(strings_list):
        # Count the occurrences of each string, keeping first-seen order.
        counts = {}
        for string in strings_list:
            counts[string] = counts.get(string, 0) + 1
        return counts

    if not Doc.has_extension("cvalues"):
        Doc.set_extension("cvalues", default=None)

    nlp_for_ngrams = spacy.load(spacy_models[language])
    nlp_for_ngrams.disable_pipes('lemmatizer')
    nlp_for_ngrams.add_pipe("cvalues")

    all_terms = {}
    all_entities = {}
    for num_topic in tqdm(relevant_i_by_topics, bar_format="{l_bar}{bar:10}{r_bar}", desc="TOPICS CONFIGURATIONS PROCESSED"):
        all_terms[num_topic] = {}
        all_entities[num_topic] = {}
        for topic in relevant_i_by_topics[num_topic]:
            text_parts = []
            entities_dict = {}
            for i in relevant_i_by_topics[num_topic][topic]:
                text_parts.append(documents_lemmatized[i] + ' . ')
                doc_entities = convert_to_dict(extract_np_sequences(all_tab_pos[i]))
                for ent, count in doc_entities.items():
                    if len(ent) > 1:
                        entities_dict[ent] = entities_dict.get(ent, 0) + count

            obj_nlp = nlp_for_ngrams(''.join(text_parts))

            all_terms[num_topic][topic] = extract_terms([obj_nlp], 10)
            all_entities[num_topic][topic] = entities_dict

    all_true_entities = {}
    for num_topic, entities_by_topic in all_entities.items():
        all_true_entities[num_topic] = {}
        for topic, topic_entities in entities_by_topic.items():
            scored_entities = []
            for ent, count in topic_entities.items():
                # Occurrences of the same sequence in the other topics
                # (direct dictionary lookup instead of scanning every entity).
                actual_sum = sum(
                    other_entities.get(ent, 0)
                    for other_topic, other_entities in entities_by_topic.items()
                    if other_topic != topic
                )
                if actual_sum > 0:
                    scored_entities.append([ent, count / sqrt(actual_sum)])
                else:
                    scored_entities.append([ent, count])
            all_true_entities[num_topic][topic] = scored_entities

    save_to_disk_terms_or_entities(all_terms, 'terms')
    save_to_disk_terms_or_entities(all_true_entities, 'entities')

In [None]:
def extract_np_sequences(atb):
    """
    Collect runs of consecutive proper-noun tokens from a tagged document.

    A token belongs to a run when it is found in the module-level
    ``unigrams_proper_nouns``. A single token tagged ``'det'`` between two
    such tokens is kept inside the run; a trailing ``'det'`` is dropped.
    Any other token, or a second consecutive ``'det'``, ends the run.

    Args:
        atb: Iterable of ``(token, func)`` pairs.

    Returns:
        list: The runs found, each as a space-joined string, in order of
        appearance.
    """
    sequences = []
    current = []
    has_pending_det = False  # a 'det' was seen after the last proper noun

    for token, func in atb:
        if token in unigrams_proper_nouns:
            # The determiner is only added once a proper noun follows it.
            if has_pending_det and len(current) > 0:
                current.append(pending_det)
                has_pending_det = False
            current.append(token)
            continue

        # First determiner inside a run: remember it and wait for the next token.
        if func == 'det' and len(current) > 0 and not has_pending_det:
            pending_det = token
            has_pending_det = True
            continue

        # Anything else (including a second determiner in a row) closes the run.
        if current:
            sequences.append(' '.join(current))
        current = []
        has_pending_det = False

    # A run may still be open at the end of the input.
    if current:
        sequences.append(' '.join(current))

    return sequences

In [None]:
def save_to_disk_terms_or_entities(main_dict, kind):
    """
    Write one CSV file per sub-dictionary with its 10 best items per key.

    Files are created under ``results_path + base_name`` (module-level
    values). Each file has two columns per key (``<key>_terms`` or
    ``<key>_entities``, then ``<key>_scores``); shorter columns are padded
    with empty cells. A sub-dictionary without keys produces a file holding
    only an empty header row.

    Args:
        main_dict: Mapping ``name -> {key -> [[item, score], ...]}``. It is
            not modified.
        kind: ``'entities'`` to write entity files; any other value writes
            term files.
    """
    label = 'entities' if kind == 'entities' else 'terms'

    for sub_dict_name, sub_dict in sorted(main_dict.items()):
        # Keys in sorted order, each reduced to its 10 highest-scoring items.
        top_by_key = {
            key: sorted(value, key=lambda x: x[1], reverse=True)[:10]
            for key, value in sorted(sub_dict.items())
        }

        file_path = results_path + base_name + f"_topic_model_{label}_output_name_{sub_dict_name}t.csv"

        with open(file_path, 'w', newline='', encoding='utf-8') as file:
            writer = csv.writer(file)

            headers = []
            for key in top_by_key:
                headers.extend([f"{key}_{label}", f"{key}_scores"])
            writer.writerow(headers)

            # The longest column fixes the number of rows; default=0 covers a
            # sub-dictionary without any key, on which max() would raise.
            max_length = max((len(value) for value in top_by_key.values()), default=0)

            for i in range(max_length):
                row = []
                for value in top_by_key.values():
                    if i < len(value):
                        row.extend(value[i])
                    else:
                        row.extend([None, None])  # pad shorter columns
                writer.writerow(row)

In [None]:
def create_topic_barplots():
    """
    Save one horizontal bar plot per topic configuration.

    For every configuration in ``every_topics_and_scores_by_document`` the
    scores of each topic are gathered over all documents. The bar length is
    the sum of these scores and the bar colour reflects their (population)
    variance, rescaled to [0, 1] and mapped on the ``coolwarm`` colormap.
    Bars are labelled with ``original_labels`` and ordered by decreasing sum.
    Each figure is saved as
    ``{results_path}{base_name}_{key}t_barplot.png``; the sorted labels,
    variances and sums are also printed.
    """
    global original_labels
    dict_final = {}

    for key, sub_dict in every_topics_and_scores_by_document.items():
        # Gather the scores of every topic across all documents.
        scores_by_topic = {}
        for tuple_list in sub_dict.values():
            for t in tuple_list:
                scores_by_topic.setdefault(t[0], []).append(t[1])

        # Sum and population variance of the scores of each topic.
        stats_by_topic = {}
        for topic, values in scores_by_topic.items():
            n = len(values)
            total = sum(values)
            mean = total / n
            variance = sum((x - mean) ** 2 for x in values) / n
            stats_by_topic[topic] = (total, variance)

        dict_final[key] = stats_by_topic

    sns.set_style("white")  # white background, no grid

    cmap = plt.cm.coolwarm

    for key, stats_by_topic in dict_final.items():
        # Sort by increasing sum, then reverse to get the largest sum first.
        ranked = sorted(stats_by_topic.items(), key=lambda x: x[1][0])[::-1]

        sorted_strings = [original_labels[key][topic] for topic, _ in ranked]
        sorted_variances = [stats[1] for _, stats in ranked]
        sorted_first_values = [stats[0] for _, stats in ranked]

        print(f"Clé {key}: Chaînes triées: {sorted_strings}")
        print(f"Clé {key}: Variances triées: {sorted_variances}")
        print(f"Clé {key}: Y triées: {sorted_first_values}")

        # Rescale the variances to [0, 1] and turn them into colours.
        normed_variances = np.interp(sorted_variances,
                                     (min(sorted_variances), max(sorted_variances)),
                                     (0, 1))
        colors = [cmap(v) for v in normed_variances]

        # Figure height grows with the number of bars.
        height_per_bar = 0.5
        total_height = len(sorted_strings) * height_per_bar

        plt.figure(figsize=(4, total_height), dpi=150)

        labels_array = np.array(sorted_strings)
        sns.barplot(y=labels_array,
                    x=np.array(sorted_first_values),
                    hue=np.array(sorted_strings),
                    palette=colors,
                    orient='h',
                    dodge=False)

        plt.legend([], [], frameon=False)  # hides the legend

        plt.xticks([])
        plt.box(False)

        plt.tight_layout()
        plt.savefig(f"{results_path}{base_name}_{key}t_barplot.png", format='png', dpi=300, bbox_inches='tight')
        plt.close()

In [None]:
def create_umap_best_ngrams(unigrams_nb, n_neighbors):
    """
    Plot a 2-D UMAP map of the common nouns with the highest mean TF-IDF.

    The TF-IDF matrix is rebuilt from ``unigrams_common_nouns``; the
    ``unigrams_nb`` features with the highest mean TF-IDF are kept, their
    pairwise cosine similarities are projected with UMAP, and each word is
    drawn as a text label coloured by the standard deviation of its TF-IDF
    column. Two files are written under ``results_path + base_name``: a CSV
    of ``word, standard deviation`` rows and the PNG figure.

    Args:
        unigrams_nb: Number of words to display.
        n_neighbors: ``n_neighbors`` parameter passed to UMAP.
    """
    _, tfidf, tfidf_feature_names, _ = go_tfidf_vectorization(unigrams_common_nouns)

    # Rank the features by mean TF-IDF and keep the best ones.
    average_tfidf = np.array(tfidf.sum(axis=0) / tfidf.shape[0]).flatten()
    top_indices = average_tfidf.argsort()[::-1][:unigrams_nb]
    X_reduced = tfidf[:, top_indices]

    similarity_matrix = cosine_similarity(X_reduced.T)
    embedding = umap.UMAP(n_neighbors=n_neighbors).fit_transform(similarity_matrix)

    # Per-word standard deviation, in the order of top_indices. Only the
    # selected columns are densified rather than the whole matrix.
    color_values = np.std(X_reduced.toarray(), axis=0)
    top_names = tfidf_feature_names[top_indices]

    sns.set_style("whitegrid")

    plt.figure(figsize=(15, 12))

    # Empty marker: the points are not drawn, only the text labels below
    # (presumably this call just sets up the axes — verify).
    sns.scatterplot(x=embedding[:, 0], y=embedding[:, 1], marker="", legend=False)

    norm = plt.Normalize(np.min(color_values), np.max(color_values))
    colors = plt.cm.winter(norm(color_values))

    texts = []
    for i, lemma in enumerate(top_names):
        texts.append(plt.text(embedding[i, 0], embedding[i, 1], lemma, size=8, c=colors[i], ha='center', va='center'))

    adjust_text(texts)

    axes = plt.gca()
    for side in ('right', 'top', 'bottom', 'left'):
        axes.spines[side].set_visible(False)

    axes.xaxis.set_ticks([])
    axes.yaxis.set_ticks([])

    all_std_by_name = dict(zip(top_names, color_values))

    with open(f"{results_path}{base_name}_main_common_nouns_tfidf_std.csv", "w", newline='', encoding='utf-8') as fichier_csv:
        writer = csv.writer(fichier_csv)

        # One "word, standard deviation" row per selected word.
        for cle, valeur in all_std_by_name.items():
            writer.writerow([cle, valeur])

    plt.savefig(f"{results_path}{base_name}_umap_{unigrams_nb}un_{n_neighbors}nn_network.png", dpi=300, bbox_inches='tight')
    plt.close()

In [None]:
def determine_unigrams():
    """
    Rebuild the module-level unigram collections from the tagged documents.

    Each collection is replaced by the value returned by
    ``update_candidates_for_unigram`` for the relevant tags:

    * ``unigrams``: tags ``'np'``, ``'nc'`` and ``'v'``;
    * ``unigrams_nouns``: tags ``'np'`` and ``'nc'``;
    * ``unigrams_proper_nouns``: tag ``'np'``;
    * ``unigrams_common_nouns``: tag ``'nc'``.

    NOTE(review): judging by the collection names, 'np' / 'nc' / 'v'
    presumably stand for proper nouns / common nouns / verbs — confirm against
    ``update_candidates_for_unigram``.
    """
    global unigrams
    global unigrams_common_nouns
    global unigrams_nouns
    global unigrams_proper_nouns

    # Tag 'np': feeds the general, noun and proper-noun collections.
    unigrams = update_candidates_for_unigram('np', unigrams)
    unigrams_nouns = update_candidates_for_unigram('np', unigrams_nouns)
    unigrams_proper_nouns = update_candidates_for_unigram('np', unigrams_proper_nouns)

    # Tag 'nc': feeds the general, noun and common-noun collections.
    unigrams = update_candidates_for_unigram('nc', unigrams)
    unigrams_nouns = update_candidates_for_unigram('nc', unigrams_nouns)
    unigrams_common_nouns = update_candidates_for_unigram('nc', unigrams_common_nouns)

    # Tag 'v': only feeds the general collection.
    unigrams = update_candidates_for_unigram('v', unigrams)

In [None]:
def write_raw_documents():
    """Write every entry of ``documents`` to ``raw_documents_output_name``,
    one per line (UTF-8, file overwritten)."""
    with open(raw_documents_output_name, "w", encoding='utf-8') as file_object:
        file_object.writelines(dfn + '\n' for dfn in documents)

In [None]:
def write_lemmatized_documents():
    """Write every entry of ``documents_lemmatized`` to
    ``lemmatized_documents_output_name``, one per line (UTF-8, file
    overwritten)."""
    with open(lemmatized_documents_output_name, "w", encoding='utf-8') as file_object:
        file_object.writelines(dfn + '\n' for dfn in documents_lemmatized)

In [None]:
# language -> (pattern for "DD Month YYYY", month name -> month number)
_TEXTUAL_DATE_RULES = {
    'fr': (
        r"(\d{1,2})\s+([a-zA-Zéû]+)\s+(\d{4})",
        {
            'janvier': '01', 'février': '02', 'mars': '03', 'avril': '04', 'mai': '05',
            'juin': '06', 'juillet': '07', 'août': '08', 'septembre': '09', 'octobre': '10',
            'novembre': '11', 'décembre': '12',
            # "aot" is accepted as a damaged spelling of "août".
            'aot': '08',
        },
    ),
    'en': (
        r"(\d{1,2})\s+([a-zA-Z]+)\s+(\d{4})",
        {
            'january': '01', 'february': '02', 'march': '03', 'april': '04', 'may': '05',
            'june': '06', 'july': '07', 'august': '08', 'september': '09', 'october': '10',
            'november': '11', 'december': '12',
        },
    ),
    'pt': (
        r"(\d{1,2})\s+([a-zA-Zçûá]+)\s+(\d{4})",
        {
            'janeiro': '01', 'fevereiro': '02', 'março': '03', 'abril': '04', 'maio': '05',
            'junho': '06', 'julho': '07', 'agosto': '08', 'setembro': '09', 'outubro': '10',
            'novembro': '11', 'dezembro': '12',
        },
    ),
    'es': (
        r"(\d{1,2})\s+([a-zA-Z]+)\s+(\d{4})",
        {
            'enero': '01', 'febrero': '02', 'marzo': '03', 'abril': '04', 'mayo': '05',
            'junio': '06', 'julio': '07', 'agosto': '08', 'septiembre': '09', 'octubre': '10',
            'noviembre': '11', 'diciembre': '12',
        },
    ),
}


def extract_date_from_text(date_text, lang=None):
    """
    Convert a date written in words into a numeric, slash-separated date.

    For French, English, Portuguese and Spanish, the first "DD Month YYYY"
    occurrence becomes ``"DD/MM/YYYY"`` (the day is left as written); a text
    that already starts with ``D/M/YYYY`` is returned as is. For Japanese,
    the first "YYYY年MM月DD日" occurrence becomes ``"YYYY/MM/DD"``.
    NOTE(review): the Japanese output is year-first, unlike the other
    languages — confirm that callers expect this.

    Args:
        date_text: Text containing the date.
        lang: Language code. When ``None`` (default), the module-level
            ``language`` is used. Japanese is accepted as ``'ja'`` (the code
            used elsewhere in this notebook) and as the legacy ``'jp'``.

    Returns:
        str: The converted date, or ``date_text`` unchanged when the language
        is not supported or no known date is found.
    """
    if lang is None:
        lang = language

    if lang in ('ja', 'jp'):
        match = re.search(r"(\d{1,4})年(\d{1,2})月(\d{1,2})日", date_text)
        if match:
            year, month, day = match.groups()
            return f"{year}/{month.zfill(2)}/{day.zfill(2)}"
        return date_text

    rules = _TEXTUAL_DATE_RULES.get(lang)
    if rules is None:
        return date_text
    pattern, month_dict = rules

    # Already in numeric form: nothing to convert.
    if re.match(r'\d{1,2}/\d{1,2}/\d{4}', date_text):
        return date_text

    match = re.search(pattern, date_text)
    if match:
        day, month, year = match.groups()
        month_num = month_dict.get(month.lower())
        if month_num:
            return f"{day}/{month_num}/{year}"

    return date_text  # return the original text when it cannot be converted

In [None]:
def extract_information(header, selector):
    """
    Return the text of every element of ``header`` matching ``selector``.

    Args:
        header: Object providing ``select(selector)``.
        selector: Selector passed to ``header.select``.

    Returns:
        str: The texts of the matching elements joined with ``"////"``, with
        every ``;`` replaced by ``,``; ``"N/A"`` when nothing matches.
    """
    matches = header.select(selector)
    if not matches:
        return "N/A"

    cleaned_texts = (get_text_from_tag(tag).replace(';', ',') for tag in matches)
    return "////".join(cleaned_texts)

In [None]:
def get_text_from_tag(tag):
    """Return all the strings exposed by ``tag.strings`` concatenated, without separator."""
    pieces = tag.strings
    return ''.join(pieces)

In [None]:
def normalize_journal(t):
    """
    Reduce a journal name to a short, lower-case form.

    Applied in order, on each line of the stripped text: drop everything from
    the first comma, drop everything from the first hyphen, drop parenthesised
    parts, drop everything from the first run of three or more spaces. The
    result is stripped and lower-cased.

    Args:
        t: Raw journal name.

    Returns:
        str: The normalised name.
    """
    cleanup_patterns = (
        r',.*',      # from the first comma onwards
        r'-.*',      # from the first hyphen onwards
        r'\(.*?\)',  # parenthesised text, parentheses included
        r' {3,}.*',  # from three or more consecutive spaces onwards
    )

    t = t.strip()
    for pattern in cleanup_patterns:
        t = re.sub(pattern, '', t)

    return t.strip().lower()

In [None]:
import re

def extract_date_info(date_text, language='fr'):
    """
    Extract the first "D Month 20YY" date found in a text.

    The day is one or two digits, the month is a full month name in the given
    language (case-sensitive) and the year must lie between 2000 and 2029.

    Args:
        date_text: Text to search.
        language: ``'fr'`` (default) or ``'en'``.

    Returns:
        str: The matched date, or ``date_text`` unchanged when nothing
        matches or when the language is not supported.
    """
    month_names = {
        'fr': "janvier|février|mars|avril|mai|juin|juillet|août|septembre|octobre|novembre|décembre",
        'en': "January|February|March|April|May|June|July|August|September|October|November|December",
    }

    months = month_names.get(language)
    if months is None:
        # Unsupported language: behave like "no date found" instead of
        # failing on an unassigned pattern.
        return date_text

    regex = r"([1-3]?[0-9]\s(" + months + r")\s20[0-2][0-9])"
    date_text_clean = re.search(regex, date_text)
    return date_text_clean.group() if date_text_clean else date_text

In [None]:
import re

def normalise_date(date_text, lang=None):
    """
    Convert a "Month D, YYYY" date into ``"D/MM/YYYY"``.

    Args:
        date_text: Text containing the date.
        lang: Language of the month names, ``'en'`` or ``'fr'``. When
            ``None`` (default), the module-level ``language`` is used.

    Returns:
        str: The converted date (the day is left as written), or
        ``date_text`` unchanged when no date is found, the month is unknown
        or the language has no month table.
    """
    if lang is None:
        lang = language

    # Month name -> month number, per language.
    month_dict = {
        'en': {
            'january': '01', 'february': '02', 'march': '03', 'april': '04', 'may': '05',
            'june': '06', 'july': '07', 'august': '08', 'september': '09', 'october': '10',
            'november': '11', 'december': '12'
        },
        'fr': {
            'janvier': '01', 'février': '02', 'mars': '03', 'avril': '04', 'mai': '05',
            'juin': '06', 'juillet': '07', 'août': '08', 'septembre': '09', 'octobre': '10',
            'novembre': '11', 'décembre': '12'
        }
    }

    # "é" and "û" are needed for février, août and décembre, which a plain
    # [a-zA-Z]+ could never capture in full.
    pattern = r"([a-zA-Zéû]+)\s+(\d{1,2}),\s+(\d{4})"
    match = re.search(pattern, date_text)

    if match:
        month, day, year = match.groups()
        # .get(): a language without a month table falls through to the
        # unchanged text instead of raising KeyError.
        month_num = month_dict.get(lang, {}).get(month.lower())
        if month_num:
            return f"{day}/{month_num}/{year}"

    # Return the original text when nothing could be converted.
    return date_text

In [None]:
def standardize_name(name):
    """
    Return the words of ``name`` in sorted order, separated by single spaces.

    This makes two strings holding the same words in a different order
    identical.
    """
    return ' '.join(sorted(name.split()))

In [None]:
def split_names(s):
    """
    Split a string made of 2, 3 or 4 two-word names into separate names.

    Args:
        s: String to split on whitespace.

    Returns:
        list: The consecutive two-word names when ``s`` has exactly 4, 6 or 8
        words; otherwise a one-element list holding ``s`` unchanged.
    """
    words = s.split()
    if len(words) in (4, 6, 8):
        # Pair the words two by two. The 8-word case used to repeat the third
        # name (words 4-6) as the fourth instead of taking words 6-8.
        return [' '.join(words[start:start + 2]) for start in range(0, len(words), 2)]

    return [s]

In [None]:
def extract_names(line):
    """Extract author names from a byline.

    The patterns below are all lowercase, so *line* is expected to be
    lowercased by the caller. Role descriptions, newspaper names, handles and
    punctuation are stripped, then what remains is split into names on commas
    and on the word "et".

    Args:
        line: The raw byline text.

    Returns:
        A set of name strings, or ``None`` when the line is longer than 150
        characters or contains a domain (".fr", ".com") or "n/a".
    """
    if len(line) > 150:
        return None

    # Drop anything between parentheses.
    line = re.sub(r'\(.*?\)', '', line)

    # Lines carrying a domain or "n/a" are not bylines.
    if re.search(r'(\.fr|\.com|n/a)', line):
        return None

    # Role descriptions followed by a place ("envoyé spécial à X", ...).
    for role_pattern in (
        r'envoyé[e]? spécial[e]? à \w+',
        r'correspondant[e]? à \w+',
        r'correspondant[e]? en \w+',
        r'recueilli par',
    ):
        line = re.sub(role_pattern, '', line)

    for job_title in (
        'rédacteur en chef à',
        'rédactrice en chef à',
        'correspondant européen',
        'correspondant étranger',
        "la rédaction de l'agence",
    ):
        line = line.replace(job_title, '')

    # Social-network handles.
    line = re.sub(r'\s?@\w+', '', line)

    # Remove the word "par" only: a plain substring removal would also eat
    # the "par" inside names such as "gaspard".
    line = re.sub(r'\bpar\b', '', line)

    for char in ('.', '"', '«', '»'):
        line = line.replace(char, '')
    line = re.sub(r'\s+', ' ', line).strip()

    # NOTE(review): 'e. gr.' can never match here because dots were removed
    # above; kept to preserve the original list — confirm the intent.
    for boilerplate in (
        'directeur du monde',
        'propos recueillis',
        'cofondateur de libération',
        'e. gr.',
        'les echos',
        'repères',
        "l'air du large",
        'interview',
        "professeur agrégé d'histoire à sciences po, docteur en géographie",
        'la croix',
        'york',
    ):
        line = line.replace(boilerplate, '')

    # Same whole-word rule as for "par".
    line = re.sub(r'\bavec\b', '', line)

    # When the line contains "////", keep only what follows the first one.
    if "////" in line:
        line = line.split("////")[1].strip()

    line = line.replace(',', ', ')
    line = re.sub(r'\s+', ' ', line).strip()

    names = []
    if len(line.split()) > 3:
        # Split on commas and on the word "et"; the trailing \b keeps names
        # that merely start with "et" (e.g. "etienne") in one piece.
        for part in re.split(r',| et\b', line):
            names.extend(split_names(part.strip()))
    else:
        names.extend(split_names(line.replace(',', '').strip()))

    return set(names)

In [None]:
def write_info_europresse_lemmas(lemmas_dict, topic_nums, article, actual_doc):
    """Build one ';'-separated row per lemma of a Europresse article.

    Each row is ``lemma;postag;count`` followed by the article-level fields
    produced by ``write_info_europresse`` (title, authors, raw authors,
    character count, journal, date, topic/score pairs).

    Args:
        lemmas_dict: Maps a key whose first two items are the lemma and its
            POS tag to the count written in the third column.
        topic_nums: Iterable of ``(topic, score)`` pairs for the article.
        article: Parsed article; its ``header`` is queried for metadata.
        actual_doc: The document text; only its length is written.

    Returns:
        A list of row strings, one per key of *lemmas_dict*.
    """
    # The article-level part is identical for every lemma: compute it once
    # through the sibling writer instead of re-querying the header per lemma.
    article_fields = write_info_europresse(topic_nums, article, actual_doc)

    return [
        f"{lemmapos[0]};{lemmapos[1]};{lemmas_dict[lemmapos]};{article_fields}"
        for lemmapos in lemmas_dict
    ]

In [None]:
def write_info_europresse(topic_nums, article, actual_doc):
    """Build the ';'-separated metadata row of a Europresse article.

    Field order: title, authors, raw author line, character count, journal,
    date, then one ``topic;score`` pair per entry of *topic_nums*.

    Args:
        topic_nums: Iterable of ``(topic, score)`` pairs for the article.
        article: Parsed article; its ``header`` is queried for metadata.
        actual_doc: The document text; only its length is written.

    Returns:
        The row as a single string. The authors field is the literal
        ``"None"`` when no name could be extracted.
    """
    header = article.header

    title_text = extract_information(header, '.titreArticle p')
    journal_text = extract_information(header, '.rdp__DocPublicationName')
    date_text = extract_information(header, '.DocHeader')

    journal_text = normalize_journal(journal_text)
    date_text_clean = extract_date_info(date_text)
    date_normalized = normalise_date(date_text_clean).replace(';', '').replace('&', '')
    date_normalized = extract_date_from_text(date_normalized)

    topics_str = ";".join(f"{topic};{score}" for topic, score in topic_nums)

    # Query the author line once; it feeds both name extraction and the
    # raw_authors column.
    raw_authors = extract_information(header, '.sm-margin-bottomNews').lower()

    names = extract_names(raw_authors)
    if names:
        actual_names = [standardize_name(name) for name in names]
        token_sets = [set(name.split()) for name in actual_names]
        # Drop a name whose words are a strict subset of another name's words
        # (e.g. a bare surname next to the full name).
        kept_names = [
            name
            for name, tokens in zip(actual_names, token_sets)
            if not any(other != name and tokens < other_tokens
                       for other, other_tokens in zip(actual_names, token_sets))
        ]
        chaine_authors = ', '.join(map(str, kept_names))
    else:
        chaine_authors = "None"

    return f"{title_text.replace(';', '')};{chaine_authors};{raw_authors};{len(actual_doc)};{journal_text.replace(';', '')};{date_normalized};{topics_str}"

In [None]:
def write_info_another(topic_nums, columns_dicts, i, actual_doc):
    """Build the ';'-separated row of document *i* for a non-Europresse corpus.

    The row holds the value of every column of *columns_dicts* at index *i*,
    then the length of *actual_doc*, then one ``topic;score`` pair per entry
    of *topic_nums*.
    """
    column_cells = ''.join(str(values[i]) + ';' for values in columns_dicts.values())
    topic_cells = ";".join(f"{topic};{score}" for topic, score in topic_nums)

    return f"{column_cells}{len(actual_doc)};{topic_cells}"

In [None]:
def write_info_another_lemmas(lemmas_dict, topic_nums, columns_dicts, i, actual_doc):
    """Build one ';'-separated row per lemma of document *i* (non-Europresse).

    Each row is ``lemma;postag;count`` followed by the column values of
    document *i*, the length of *actual_doc* and the ``topic;score`` pairs.
    """
    topic_cells = ";".join(f"{topic};{score}" for topic, score in topic_nums)
    column_cells = ''.join(str(values[i]) + ';' for values in columns_dicts.values())

    # Shared by every lemma of the document.
    document_fields = f"{column_cells}{len(actual_doc)};{topic_cells}"

    rows = []
    for lemmapos, count in lemmas_dict.items():
        rows.append(f"{lemmapos[0]};{lemmapos[1]};{count};{document_fields}")

    return rows

In [None]:
def lister_html_insensible(dossier):
    """List the HTML files of *dossier* whose name contains ``base_name``.

    The ".html" extension is matched case-insensitively; directories are
    skipped. ``base_name`` is read from the module globals.

    Returns:
        The matching file names (not full paths), in ``os.listdir`` order.
    """
    matches = []
    for entry in os.listdir(dossier):
        if not entry.lower().endswith('.html'):
            continue
        if not os.path.isfile(os.path.join(dossier, entry)):
            continue
        if base_name in entry:
            matches.append(entry)
    return matches

In [None]:
def remove_urls_hashtags_emojis_mentions_emails(text):
    """Remove http(s) URLs from *text*.

    Despite the name, only URL removal is active. Hashtag, mention, e-mail
    and emoji stripping are switched off, so those are left in the text.

    Args:
        text: The text to clean.

    Returns:
        *text* with every ``http://`` / ``https://`` URL (up to the next
        whitespace) removed.
    """
    # Disabled filters, kept for reference:
    #   hashtags: r'#\w+'
    #   mentions: r'@\w+'
    #   e-mails:  r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b'
    #   emojis:   a character class of emoji ranges. Its last range,
    #             U+24C2-U+1F251, also spans the CJK blocks and would need
    #             narrowing before the filter is switched back on.
    # The emoji pattern used to be compiled on every call without ever being
    # applied; that dead work is gone.
    return re.sub(r'https?://\S+', '', text)

In [None]:
def write_lemmatized_documents():
    """Write every lemmatized document on its own line.

    Reads the global ``documents_lemmatized`` and (over)writes the UTF-8 file
    named by the global ``lemmatized_documents_output_name``.
    """
    with open(lemmatized_documents_output_name, "w", encoding='utf-8') as out:
        out.writelines(doc + '\n' for doc in documents_lemmatized)

In [None]:
def extract_info(topic_nums, article):
    """Pair the date of a Europresse article with its topic scores.

    The date is read from the '.DocHeader' element of the article header and
    passed through the date helpers of this module.

    Returns:
        A one-entry dict ``{date: {topic: score}}`` built from *topic_nums*.
    """
    raw_header_date = extract_information(article.header, '.DocHeader')

    cleaned_date = normalise_date(extract_date_info(raw_header_date))
    for forbidden in (';', '&'):
        cleaned_date = cleaned_date.replace(forbidden, '')

    return {extract_date_from_text(cleaned_date): dict(topic_nums)}

In [None]:
def aggregate_scores(articles_info):
    """Sum the topic scores of all articles, grouped by date.

    Args:
        articles_info: Iterable of ``{date: {topic: score}}`` dicts.

    Returns:
        ``{date: {topic: summed score}}``.
    """
    totals = {}

    for article in articles_info:
        for day, topic_scores in article.items():
            day_totals = totals.setdefault(day, {})
            for topic_id, value in topic_scores.items():
                day_totals[topic_id] = day_totals.get(topic_id, 0) + value

    return totals

In [None]:
def count_articles_by_date(articles_info):
    """Count how many articles carry each date.

    Args:
        articles_info: Iterable of dicts keyed by date.

    Returns:
        ``{date: number of dicts in which the date appears as a key}``.
    """
    counts = {}

    for article in articles_info:
        for day in article:
            counts[day] = counts.get(day, 0) + 1

    return counts

In [None]:
def normalize_scores_by_article_count(aggregated_scores, aggregated_article_counts):
    """Divide each date's topic scores by the number of articles of that date.

    Args:
        aggregated_scores: ``{date: {topic: score}}``.
        aggregated_article_counts: ``{date: article count}``; must hold an
            entry for every date that has at least one topic score.

    Returns:
        A new ``{date: {topic: score / count}}`` dict.
    """
    return {
        day: {
            topic_id: total / aggregated_article_counts[day]
            for topic_id, total in topic_totals.items()
        }
        for day, topic_totals in aggregated_scores.items()
    }

In [None]:
def create_chrono_topics(normalize_by_date,
                         relative_normalizaton,
                         sigma):
    """Save one chronological topic heatmap (PNG) per topic configuration.

    For every key of the global ``all_nmf_W``, the per-document topic scores
    are summed by date, optionally divided by the number of articles of the
    date, scaled row by row, spread over a continuous daily range, smoothed
    with a Gaussian filter, optionally min-max normalized, reordered by
    hierarchical clustering and drawn as a heatmap. The aggregated scores are
    also dumped as JSON next to the image.

    Args:
        normalize_by_date: When truthy, divide each date's scores by its
            article count (and write a second JSON file).
        relative_normalizaton: When truthy, min-max normalize each smoothed
            row.
        sigma: Sigma passed to ``gaussian_filter``, or the string ``'auto'``
            to derive it from the rows' standard deviations.
    """
    global all_nmf_W
    global original_labels
    global every_topics_and_scores_by_document
    global is_europresse
    global columns_dicts

    from sklearn.metrics.pairwise import manhattan_distances


    for num_topic in tqdm(all_nmf_W, bar_format="{l_bar}{bar:10}{r_bar}", desc="TOPICS CONFIGURATIONS PROCESSED"):
        # Build one {date: {topic: score}} dict per document.
        if is_europresse:
            articles_info = [extract_info(every_topics_and_scores_by_document[num_topic][i], article) for i, article in enumerate(all_soups)]
        else:
            from dateutil.parser import parse

            def detecter_date(chaine, jour_en_premier=True):
                # Parse a date string; None when dateutil rejects it.
                try:
                    return parse(chaine, dayfirst=jour_en_premier)
                except ValueError:
                    return None

            def formater_date(date):
                return date.strftime('%d/%m/%Y')

            def formater_liste_dates(liste_dates, jour_en_premier=True):
                # Unparseable strings are dropped from the result.
                return [formater_date(detecter_date(date_str, jour_en_premier)) for date_str in liste_dates if detecter_date(date_str, jour_en_premier)]

            articles_info = []
            all_dates = formater_liste_dates(columns_dicts['date'])
            # NOTE(review): because unparseable dates are dropped above,
            # all_dates[i] may no longer belong to document i once one date
            # has been skipped — confirm that every date always parses.
            i = 0
            while i < len(all_dates):
                articles_info.append({all_dates[i]: dict(every_topics_and_scores_by_document[num_topic][i])})
                i += 1

        aggregated_scores = aggregate_scores(articles_info)

        def extract_and_convert_date(date_str):
            # Pull the first dd/mm/yyyy date out of the string with a regex.
            match = re.search(r'\d{2}/\d{2}/\d{4}', date_str)
            if match:
                return datetime.strptime(match.group(), '%d/%m/%Y')
            else:
                # No date found in the string.
                return None

        # Keep only the entries that carry a valid date, re-keyed by that date.
        valid_dates = {}
        for date_str, score in aggregated_scores.items():
            date = extract_and_convert_date(date_str)
            if date:
                valid_dates[date.strftime('%d/%m/%Y')] = score

        # Order the valid dates chronologically.
        # sorted_dates = sorted(valid_dates.keys(), key=lambda date: datetime.strptime(date, '%d/%m/%Y'))
        aggregated_scores_sorted = {date: valid_dates[date]
                                    for date in sorted(valid_dates,
                                                    key=lambda date: datetime.strptime(date, '%d/%m/%Y'))}

        #aggregated_scores_sorted = {date: valid_dates[date] for date in sorted_dates}
        aggregated_article_counts = count_articles_by_date(articles_info)

        with open(f"{results_path}{base_name}_aggregated_scores_sorted_{num_topic}.json", 'w', encoding='utf-8') as file:
            json.dump(aggregated_scores_sorted, file, indent=4)

        if normalize_by_date:
            # NOTE(review): the counts are keyed by the raw date strings while
            # the scores were re-keyed to the bare dd/mm/yyyy date above; a
            # raw key holding more than the bare date would presumably raise
            # KeyError here — verify against extract_date_from_text.
            aggregated_scores_sorted = normalize_scores_by_article_count(aggregated_scores_sorted, aggregated_article_counts)

            with open(f"{results_path}{base_name}_aggregated_scores_sorted_normalized_by_date_{num_topic}.json",
                    'w',
                    encoding='utf-8') as file:
                json.dump(aggregated_scores_sorted, file, indent=4)

        # Columns are dates, rows are topics; each row is divided by its own maximum.
        df = pd.DataFrame(aggregated_scores_sorted)
        df_normalized = df.div(df.max(axis=1), axis=0)

        # Turn the column labels into datetimes for easier handling.
        df_normalized.columns = pd.to_datetime(df_normalized.columns, dayfirst=True)

        # Convert the keys to dates and find the most recent one.
        dates = [datetime.strptime(date, "%d/%m/%Y") for date in aggregated_scores_sorted.keys()]
        newest_date = max(dates)

        # The formatted string is discarded (no effect).
        newest_date.strftime("%d/%m/%Y")

        # Convert the keys to dates and find the oldest one.
        dates = [datetime.strptime(date, "%d/%m/%Y") for date in aggregated_scores_sorted.keys()]
        oldest_date = min(dates)

        # The formatted string is discarded (no effect).
        oldest_date.strftime("%d/%m/%Y")

        date_range = pd.date_range(start=oldest_date,
                                end=newest_date)

        # Reindex so that every day of the range has a column; missing days are filled with 0.
        df_normalized = df_normalized.reindex(columns=date_range, fill_value=0)

        #sigma = len(df_normalized.columns)/10

        list_of_series = []

        # Smooth each topic's time series with a Gaussian filter.
        for index, row in df_normalized.iterrows():
            if sigma == 'auto':
                # Sigma derived from the squared mean of the rows' standard deviations, times 300.
                filtered_values = gaussian_filter(row, sigma=(df_normalized.std(axis=1).mean()*df_normalized.std(axis=1).mean())*300)
            else:
                filtered_values = gaussian_filter(row, sigma=sigma)

            s = pd.Series(filtered_values, index=df_normalized.columns, name=index)
            list_of_series.append(s)

        df_normalized = pd.concat(list_of_series, axis=1).T

        if relative_normalizaton:
            list_of_series = []

            # Min-max normalize each row.
            # NOTE(review): a constant row gives max == min, hence a division
            # by zero — confirm whether that can occur after smoothing.
            for index, row in df_normalized.iterrows():
                normalized_values = (row - row.min()) / (row.max() - row.min())
                s = pd.Series(normalized_values, index=df_normalized.columns, name=index)
                list_of_series.append(s)

            df_normalized = pd.concat(list_of_series, axis=1).T

        # Cluster the topic rows on their Manhattan distance and reorder the
        # rows following the dendrogram leaves.
        dist_matrix = manhattan_distances(df_normalized.values)
        condensed_dist_matrix = squareform(dist_matrix)

        Z = linkage(condensed_dist_matrix, method='complete', optimal_ordering=True)   # method='ward')
        dendro = dendrogram(Z, no_plot=True)
        df_normalized = df_normalized.iloc[dendro['leaves']]
        reordered_labels = [original_labels[num_topic][i] for i in np.array(df_normalized.index)]

        #plt.figure(figsize=(20, 12))
        fig, ax = plt.subplots(figsize=(20, 12))

        ax = sns.heatmap(df_normalized, cmap="YlGnBu", cbar=False)

    #  ax.set_yticklabels(reordered_labels, rotation=0)

        # The computed offset is immediately overridden with 0.
        offset_x = int(len(dates) / 100)
        offset_x = 0
        # Draw the topic labels as right-aligned text to the left of the heatmap.
        for index, label in enumerate(reordered_labels):
            plt.text(x=-offset_x,
                    y=(index + 0.5),
                    s=label,
                    rotation=0,
                    ha='right',
                    va='center',
                    fontname='Liberation Mono')


        # Compute the dynamic interval between date labels.
        interval = calculate_date_interval(df_normalized.T,
                                        figsize=(20, 12))

        longest_string = max(reordered_labels, key=len)

        # Length of the longest topic label.
        longest_string_length = len(longest_string)

        if interval < 2:
            interval = 2

        # Widen the interval linearly with the longest label length.
        interval = int(interval * (0.0135*longest_string_length + 0.9865))

        # Configure the x axis with the dynamic interval.
        ax.xaxis.set_major_locator(mdates.DayLocator(interval=interval))

        for spine in ax.spines.values():
            spine.set_visible(False)

        ticks = ax.get_xticks()

      #  ticks = ticks[:-1]

        labels = [date.strftime('%d-%m-%Y') for date in mdates.num2date(ticks)]

        ax.set_xlabel('')  # Remove the x-axis title
        ax.set_ylabel('')  # Remove the y-axis title
        ax.set_title('')  # Remove the chart title
        ax.set_xticks([])  # Remove the x-axis ticks
        ax.set_yticks([])  # Remove the y-axis ticks

        unix_base_date = datetime(1970, 1, 1)

        # First date of the corpus (datetime.date is called on the minimum index value).
        base_date = datetime.date(df_normalized.T.index.min())

        # Convert the tick labels into corpus dates.
        adjusted_dates = []
        for label in labels:
            # Parse the label into a datetime object.
            label_date = datetime.strptime(label, "%d-%m-%Y")

            # Number of days between the label and 1970-01-01.
            delta_days = (label_date - unix_base_date).days

            # Add that offset to the first date of the corpus.
            new_date = base_date + timedelta(days=delta_days)

            # Store the adjusted date as a formatted string.
            adjusted_dates.append(new_date.strftime("%d-%m-%Y"))


        # Place each date label below the heatmap, hiding those that would
        # overflow the right edge of the axis.
        for i, label in enumerate(adjusted_dates):
            # Create the text object.
            text = ax.text(ticks[i],
                        len(reordered_labels),
                        label,
                        rotation=90,
                        ha='left',
                        va='top',
                        fontname='Liberation Mono')

            # Width of the rendered text in pixels.
            text_width_px = text.get_window_extent(renderer=fig.canvas.get_renderer()).width

            # Convert that pixel width into data coordinates.
            dx, dy = ax.transData.inverted().transform((text_width_px, 0)) - ax.transData.inverted().transform((0, 0))

            # Show the label only when tick + text width (plus a 10% margin) stays within the x limits.
            if ticks[i] + (dx + (dx*0.1)) <= ax.get_xlim()[1]:
                # It fits: show the text.
                text.set_visible(True)
            else:
                # It would overflow: hide the text.
                text.set_visible(False)


        plt.subplots_adjust(left=0, right=1, top=1, bottom=0)
        plt.tight_layout(pad=1.0, h_pad=1.0, w_pad=1.0, rect=[0, 0, 1, 1])

        plt.savefig(f"{results_path}{base_name}_{num_topic}t_{normalize_by_date}dn_{relative_normalizaton}rn_{sigma}s_heatmap.png",
                    dpi=300,
                    bbox_inches='tight',
                    pad_inches=0)
        plt.close()

In [None]:
def load_documents(name, is_europresse, minimum_caracters_nb_by_document, pbar, lock):
  """Load the documents of one corpus file.

  A Europresse file is an HTML export holding one ``<article>`` per document;
  any other file is read as a CSV whose text is in a 'text' or 'description'
  column. Documents whose length falls outside the configured bounds are
  skipped (the upper bound is the global ``maximum_caracters_nb_by_document``).

  Args:
    name: Path of the file to load.
    is_europresse: True for a Europresse HTML export, False for a CSV.
    minimum_caracters_nb_by_document: Minimum length of a kept document.
    pbar: Progress bar, advanced by one per Europresse article.
    lock: Lock held while updating *pbar*.

  Returns:
    A ``(documents, all_soups, columns_dict)`` tuple: the cleaned texts, the
    parsed article of each kept Europresse document (empty for a CSV), and
    the remaining CSV columns as ``{column: values}`` (empty for Europresse).
  """
  documents = []
  all_soups = []
  columns_dict = {}

  if is_europresse:
    # 'replace' substitutes U+FFFD for undecodable bytes. The previous
    # 'xmlcharrefreplace' handler only applies when writing, so it could not
    # recover from a malformed byte while reading.
    with open(name, 'r', encoding='utf-8', errors='replace') as file:
      document_europresse = file.read()

    document_europresse = html.unescape(document_europresse)

    document_europresse = document_europresse.replace('</article> <article>', '</article><article>')

    documents_europresse = document_europresse.split('</article><article>')

    for d in documents_europresse:
      soup = BeautifulSoup(d, features="html.parser")

      with lock:
        pbar.update(1)

      # Drop the short "Lire aussi" (read also) paragraphs that carry a link.
      for p in soup.find_all('p'):
        p_text = p.get_text()
        if ("Lire aussi" in p_text and ("http" in p_text or "https" in p_text) and len(p_text) <= 1000):
          p.decompose()

      # Articles without a body container are skipped.
      if len(soup('div', {'class': 'docOcurrContainer'})) > 0:
        for p in soup.find_all('p'):
          # Find the next alphabetic character after the paragraph that is
          # not inside a tag.
          next_char_match = re.search(r'(?<=' + re.escape(p.text) + r')\s*(?:<[^>]*>)*\s*([a-zA-Z])', str(soup))

          # When the paragraph does not end with a period and the next
          # alphabetic character is uppercase, close the sentence.
          if not p.text.endswith('.') and next_char_match and next_char_match.group(1).isupper():
            p.string = p.text + '. '

        # Re-parse so the edited paragraphs are part of the tree.
        soup = BeautifulSoup(str(soup), features='html.parser')

        candidate_text = soup('div', {'class': 'docOcurrContainer'})[0].get_text()
        if len(candidate_text) >= minimum_caracters_nb_by_document and len(candidate_text) < maximum_caracters_nb_by_document:
          candidate_text = remove_urls_hashtags_emojis_mentions_emails(candidate_text)
          candidate_text = transform_text(candidate_text)
          documents.append(candidate_text)
          all_soups.append(soup)
  else:
    def detect_delimiter(filename):
      # Guess the delimiter from the header line.
      with open(filename, 'r', newline='', encoding='utf-8') as file:
        sample = file.readline()
        sniffer = csv.Sniffer()
        return sniffer.sniff(sample).delimiter

    delimiter = detect_delimiter(name)

    df = pd.read_csv(name, delimiter=delimiter, quotechar='"')
    df.columns = df.columns.str.lower()
    df = df.rename(columns={'post created date': 'date'})
    df.fillna('', inplace=True)

    # 'text' takes precedence over 'description'; with neither column no
    # document is loaded.
    if 'text' in df:
      text_column = 'text'
    elif 'description' in df:
      text_column = 'description'
    else:
      text_column = None

    if text_column is not None:
      df = df.loc[(df[text_column].str.len() >= minimum_caracters_nb_by_document) & (df[text_column].str.len() <= maximum_caracters_nb_by_document)]
      documents = df[text_column].tolist()

    documents = [
      transform_text(remove_urls_hashtags_emojis_mentions_emails(document))
      for document in documents
    ]

    # Every other column is kept as metadata.
    for column in df.columns:
      if column != 'text' and column != 'description':
        columns_dict[column] = df[column].tolist()

  return documents, all_soups, columns_dict

In [None]:
def parallel_function(fichier_html, is_europresse, pbar, lock):
    """Load one file of the DATA folder; entry point of the thread-pool tasks.

    The path is built from the global ``folder_path`` and the minimum document
    length comes from the global ``minimum_caracters_nb_by_document``.

    Returns:
        Whatever ``load_documents`` returns for that file.
    """
    source_path = folder_path + 'DATA/' + fichier_html
    return load_documents(source_path,
                          is_europresse,
                          minimum_caracters_nb_by_document,
                          pbar,
                          lock)

In [None]:
def meta_load_documents():
    """Load every corpus file of the DATA folder into the module globals.

    The files (HTML for Europresse, CSV otherwise) whose name contains
    ``base_name`` are loaded concurrently in a thread pool. Their documents
    and parsed articles are appended to ``documents`` / ``all_soups`` in
    completion order, and their metadata columns are merged into
    ``columns_dicts``.
    """
    global documents
    global all_soups
    global columns_dicts
    global is_europresse

    data_dir = f"{folder_path}DATA/"

    # Pick up every file carrying the base name (useful for corpora split
    # over several files).
    if is_europresse:
        fichiers_html = lister_html_insensible(data_dir)
    else:
        fichiers_html = []
        for entry in os.listdir(data_dir):
            if (entry.lower().endswith('.csv')
                    and os.path.isfile(os.path.join(data_dir, entry))
                    and base_name in entry):
                fichiers_html.append(entry)

    # One progress bar shared by all the worker threads; the lock prevents
    # concurrent updates from clashing.
    # NOTE(review): the bar is never closed (the original close() call is
    # commented out) — confirm this is intentional.
    pbar = tqdm(bar_format="{l_bar}{bar:10}{r_bar}", position=0, maxinterval=2000, desc='DOCUMENTS PROCESSED')
    lock = threading.Lock()

    per_file_columns = []
    with concurrent.futures.ThreadPoolExecutor() as executor:
        pending = [
            executor.submit(parallel_function, fichier, is_europresse, pbar, lock)
            for fichier in fichiers_html
        ]
        for finished in concurrent.futures.as_completed(pending):
            file_documents, file_soups, file_columns = finished.result()
            documents.extend(file_documents)
            all_soups.extend(file_soups)
            per_file_columns.append(file_columns)

    # Concatenate the metadata columns in the same order as the documents.
    for file_columns in per_file_columns:
        for column, values in file_columns.items():
            if column not in columns_dicts:
                columns_dicts[column] = []
            columns_dicts[column].extend(values)

In [None]:
def update_candidates_for_unigram(kind, unigrams):
    """Flag the word forms whose most frequent POS tag is *kind*.

    Every form tagged *kind* at least once in the global ``all_tab_pos`` is a
    candidate; it is kept when *kind* is its most common tag over all its
    occurrences. For ``kind == 'np'`` the forms are compared after accent
    removal (on a copy, ``all_tab_pos`` itself is left untouched).

    Args:
        kind: The POS tag to select.
        unigrams: Dict updated in place with ``form -> 1`` for kept forms.

    Returns:
        The same *unigrams* dict.
    """
    global all_tab_pos

    if kind == 'np':
        tagged_docs = copy.deepcopy(all_tab_pos)
        for tagged_doc in tagged_docs:
            for token in tagged_doc:
                token[0] = unidecode.unidecode(token[0])
    else:
        tagged_docs = all_tab_pos

    # Candidate forms: tagged `kind` at least once.
    tags_by_form = {}
    for tagged_doc in tagged_docs:
        for token in tagged_doc:
            if token[1] == kind and token[0] not in tags_by_form:
                tags_by_form[token[0]] = []

    # Collect every tag each candidate form received.
    for tagged_doc in tagged_docs:
        for token in tagged_doc:
            if token[0] in tags_by_form:
                tags_by_form[token[0]].append(token[1])

    for form, tags in tags_by_form.items():
        most_frequent_tag = Counter(tags).most_common(1)[0][0]
        if most_frequent_tag == kind:
            unigrams[form] = 1

    return unigrams

In [None]:
def remove_duplicates():
    """Remove duplicated documents from the global ``documents`` list.

    Every pair of documents is compared on its first 100 tokens by
    ``check_doublons``, run in a pool of processes; the indices it reports
    are deleted from ``documents`` (and from ``all_soups`` for a Europresse
    corpus).
    """
    global is_europresse

    # os.cpu_count() may return None when the count cannot be determined.
    n_processes = os.cpu_count() or 1

    # Only the first 100 tokens of each document are compared.
    tokenized_docs = [(i, doc.split()[:100]) for i, doc in enumerate(documents)]

    # Every unordered pair of documents.
    pairs = list(itertools.combinations(tokenized_docs, 2))
    if not pairs:
        # Fewer than two documents: nothing can be a duplicate.
        return

    # Share the pairs between the processes. The chunk size is at least 1:
    # with fewer pairs than processes the integer division gives 0, which
    # range() rejects as a step.
    chunk_size = max(1, len(pairs) // n_processes)
    sub_pairs = [pairs[i:i + chunk_size] for i in range(0, len(pairs), chunk_size)]

    # A Manager list collects the indices reported by all the processes.
    with Manager() as manager:
        shared_indices = manager.list()

        with Pool(n_processes) as pool:
            pool.map(check_doublons, [(sp, shared_indices) for sp in sub_pairs])

        # De-duplicate the indices and sort them in descending order so that
        # each deletion leaves the remaining indices valid.
        indices_to_remove = sorted(set(shared_indices), reverse=True)

    for index in indices_to_remove:
        del documents[index]

        if is_europresse:
            del all_soups[index]

In [None]:
def pre_trim_this_ngram(
    wtaan,
    cmdaan):
  """Strip unwanted leading tokens from an n-gram.

  Leading tokens are dropped while their tag is 'sconj', 'poncts' or 'coo',
  or while it is an 'adj' that is followed by something other than a noun
  ('nc' / 'np').

  Args:
    wtaan: The tokens of the n-gram.
    cmdaan: The tag of each token, in the same order.

  Returns:
    ``(tokens, tags)``: the inputs themselves when nothing is trimmed, new
    lists when a prefix is removed, and ``([], [])`` when every token is
    dropped.
  """
  always_dropped = ('sconj', 'poncts', 'coo')
  noun_tags = ('nc', 'np')
  size = len(cmdaan)

  start = 0
  while start < size:
    tag = cmdaan[start]
    if tag not in always_dropped:
      adj_without_noun = (tag == 'adj'
                          and start + 1 < size
                          and cmdaan[start + 1] not in noun_tags)
      if not adj_without_noun:
        break
    start += 1

  if start >= size:
    return [], []
  if start == 0:
    return wtaan, cmdaan

  kept = range(start, size)
  return [wtaan[k] for k in kept], [cmdaan[k] for k in kept]

In [None]:
def post_trim_this_ngram(
    wtaan,
    cmdaan):
  """Strip unwanted trailing tokens from an n-gram.

  Trailing tokens are dropped while their tag is one of 'aux', 'pron', 'coo',
  'adv', 'sconj', 'other', 'det', 'prep', while it is an 'adj' preceded by
  'coo' / 'prep' / 'det', or while it is a 'num' preceded by 'det' / 'prep'.

  Args:
    wtaan: The tokens of the n-gram.
    cmdaan: The tag of each token, in the same order.

  Returns:
    ``(tokens, tags)``: the inputs themselves when nothing is trimmed, new
    lists when a suffix is removed, and ``([], [])`` when at most the first
    token would remain.
  """
  always_dropped = ('aux', 'pron', 'coo', 'adv', 'sconj', 'other', 'det', 'prep')
  last = len(cmdaan) - 1

  end = last
  while end >= 0:
    tag = cmdaan[end]
    previous = cmdaan[end - 1] if end >= 1 else None
    removable = (tag in always_dropped
                 or (tag == 'adj' and previous in ('coo', 'prep', 'det'))
                 or (tag == 'num' and previous in ('det', 'prep')))
    if not removable:
      break
    end -= 1

  # NOTE(review): a result reduced to its first token alone (end == 0) is
  # discarded as well — presumably intentional, confirm.
  if end <= 0:
    return [], []
  if end == last:
    return wtaan, cmdaan

  kept = range(end + 1)
  return [wtaan[k] for k in kept], [cmdaan[k] for k in kept]

In [None]:
def pourcentage_mots_en_commun_petit(texte1, texte2):
    """Return the percentage of the shorter text's words found in the longer.

    Both texts are reduced to their set of whitespace-separated words; the
    "shorter" text is the one with fewer distinct words.

    Args:
        texte1: First text.
        texte2: Second text.

    Returns:
        A float between 0 and 100. 0.0 when the shorter text has no words
        (this used to raise ZeroDivisionError).
    """
    mots_texte1 = set(texte1.split())
    mots_texte2 = set(texte2.split())

    # Order the two word sets by size.
    mots_texte_court, mots_texte_long = sorted((mots_texte1, mots_texte2), key=len)

    # An empty text shares nothing with the other one.
    if not mots_texte_court:
        return 0.0

    mots_communs = mots_texte_court.intersection(mots_texte_long)

    return (len(mots_communs) / len(mots_texte_court)) * 100


In [None]:
def pourcentage_mots_en_commun(texte1, texte2):
    """Return the percentage of words shared by two texts.

    Both texts are reduced to their set of whitespace-separated words and the
    score is ``2 * |common| / (|words1| + |words2|) * 100``.

    Args:
        texte1: First text.
        texte2: Second text.

    Returns:
        A float between 0 and 100. 0.0 when neither text has any word (this
        used to raise ZeroDivisionError).
    """
    mots_texte1 = set(texte1.split())
    mots_texte2 = set(texte2.split())

    total_mots = len(mots_texte1) + len(mots_texte2)
    # Two empty texts have nothing to compare.
    if total_mots == 0:
        return 0.0

    mots_communs = mots_texte1.intersection(mots_texte2)

    return (2 * len(mots_communs) / total_mots) * 100

In [None]:
def write_topics_sentences():
    """Write, for each topic configuration, the top sentences of every topic.

    For each key of the global ``relevant_i_by_topics``, every candidate
    sentence of a topic is scored with the topic weights of its distinct
    tokens, the candidates are de-duplicated, and up to 10 sentences per topic
    are passed to ``write_sentences_results``. All inputs are module globals.
    """
    for num_topic in relevant_i_by_topics:
        # One {feature name: weight} dict per topic.
        tab_words_nmf = []
        for _, topic in enumerate(all_nmf_H[num_topic]):
            subtab_words_nmf = {}
            for i in range(len(topic)):
                subtab_words_nmf[tfidf_feature_names[i]] = topic[i]

            tab_words_nmf.append(subtab_words_nmf)

        topic_dicts = []

        # Score each candidate: the sum of the topic weights of its distinct
        # tokens. The score is stored under the original (non-lemmatized)
        # sentence found at the same position.
        i = 0
        for tab in tab_words_nmf:
            topic_dict = {}
            j = 0
            for ngram in all_sentences_array[num_topic][i]:
                tokens = ngram.split()
                total_score = sum(tab.get(token, 0) for token in set(tokens))
                topic_dict[all_sentences_array_original[num_topic][i][j]] = total_score

                j += 1

            topic_dicts.append(topic_dict)

            i += 1

        top_ngrams_per_topic = []

        for _, topic_dict in enumerate(topic_dicts):

            # 1. Group the keys by score.
            value_to_keys = {}
            for key, value in topic_dict.items():
                if value not in value_to_keys:
                    value_to_keys[value] = []
                value_to_keys[value].append(key)

            # 2. Within each group of equal scores, mark every key but one for removal.
            keys_to_remove = set()
            for value, keys in value_to_keys.items():
                if len(keys) > 1:
                    # Sort the keys by word count, longest first.
                    sorted_keys = sorted(keys, key=lambda x: len(x.split()), reverse=True)
                    # Every key except the longest one is removed.
                    keys_to_remove.update(sorted_keys[1:])

            # 3. Remove the marked keys.
            for key in keys_to_remove:
                topic_dict.pop(key, None)

            # Keep the 1000 best-scored candidates.
            sorted_ngrams = sorted(topic_dict.items(), key=lambda x: x[1], reverse=True)
            top_ngrams = sorted_ngrams[:1000]

            top_ngrams_per_topic.append(top_ngrams)


        final_top_ngrams_per_topic = []

        for top_ngrams in top_ngrams_per_topic:
            final_top_ngrams = []

            for ngram, score in top_ngrams:
                # At most 10 sentences per topic (the loop keeps iterating
                # once the quota is reached, without adding anything).
                if len(final_top_ngrams) < 10:
                    # Skip a candidate contained in another top candidate.
                    if all(ngram not in other_ngram for other_ngram, _ in top_ngrams if ngram != other_ngram):
                        # Skip a candidate sharing at least 70% of its words
                        # (by both measures) with an already selected one.
                        already_in_approx = False
                        for final_ngram in final_top_ngrams:
                            if pourcentage_mots_en_commun(final_ngram[0], ngram) >= 70 and pourcentage_mots_en_commun_petit(final_ngram[0], ngram) >= 70:
                                already_in_approx = True

                        if not already_in_approx:
                            # Typographic cleanup: no space after an apostrophe
                            # nor before a comma or a period.
                            good_very_candidate = re.sub(r"(['’`]) ", r"\1", ngram).replace(' ,', ',').replace(' .', '.')
                            # Number of double quotes in the candidate.
                            count_apos = good_very_candidate.count('"')

                            # A single, unmatched double quote is removed.
                            if count_apos == 1:
                                good_very_candidate = good_very_candidate.replace('"', '', 1)
                                good_very_candidate = re.sub(r'\s+', ' ', good_very_candidate)

                            final_top_ngrams.append((good_very_candidate, score))

            final_top_ngrams_per_topic.append(final_top_ngrams)

        write_sentences_results(topic_model_sentences_output_name + '_' + str(num_topic) + '.csv',
                                final_top_ngrams_per_topic)

In [None]:
def write_topics_unigrams():
    """
    Write the unigram results of every topic configuration to its own CSV file.

    For each key ``num_topic`` of ``relevant_i_by_topics``, calls
    ``write_unigrams_results`` with ``unigrams_nb_by_topic``,
    ``tfidf_feature_names``, the output path
    ``<topic_model_unigrams_output_name>_<num_topic>.csv`` and
    ``all_nmf_H[num_topic]``.
    """
    # These globals are only read here; the declared names match the ones
    # actually passed to write_unigrams_results below.
    global tfidf_feature_names
    global all_nmf_H
    global relevant_i_by_topics

    for num_topic in relevant_i_by_topics:
        output_path = topic_model_unigrams_output_name + '_' + str(num_topic) + '.csv'
        write_unigrams_results(unigrams_nb_by_topic,
                               tfidf_feature_names,
                               output_path,
                               all_nmf_H[num_topic])

In [None]:
def sentences_extraction():
    """
    Collect, per topic configuration and per topic, the sentences to analyse.

    For every ``num_topic`` in ``relevant_i_by_topics`` and every ``topic``
    under it, two fresh lists are stored in ``all_sentences_array`` and
    ``all_sentences_array_original``. For each index of ``sentences_lemmas``
    contained in ``relevant_i_by_topics[num_topic][topic]``, the elements of
    ``sentences_lemmas[index]`` are appended to the first list and the
    elements of ``sentences_norms[index]`` to the second.
    """
    global all_sentences_array
    global all_sentences_array_original
    global sentences_norms
    global sentences_lemmas

    for num_topic in tqdm(relevant_i_by_topics, desc="TOPICS CONFIGURATIONS PROCESSED", bar_format="{l_bar}{bar:10}{r_bar}"):
        all_sentences_array[num_topic] = {}
        all_sentences_array_original[num_topic] = {}

        for topic in relevant_i_by_topics[num_topic]:
            relevant_indices = relevant_i_by_topics[num_topic][topic]

            # Register the (still empty) lists first, then fill them in place.
            collected_lemmas = []
            collected_originals = []
            all_sentences_array[num_topic][topic] = collected_lemmas
            all_sentences_array_original[num_topic][topic] = collected_originals

            for index in range(len(sentences_lemmas)):
                if index not in relevant_indices:
                    continue

                collected_lemmas.extend(sentences_lemmas[index])
                collected_originals.extend(sentences_norms[index])

In [None]:
def _build_title_row(leading_columns, num_topic):
    """Return the ';'-separated header: leading columns, then topic_i/score_i pairs."""
    columns = list(leading_columns)
    for i in range(num_topic):
        columns.append('topic_' + str(i))
        columns.append('score_' + str(i))

    return ';'.join(columns)


def _count_content_lemmas(tagged_tokens):
    """Count (unigram, postag) pairs whose tag is 'np', 'nc' or 'v'."""
    lemmas_dict = {}
    for unigramme, postag in tagged_tokens:
        if postag in ('np', 'nc', 'v'):
            # ';' is the CSV separator of the output files, so it is stripped
            # from the unigram along with the 'l&apos;' prefix.
            unigramme = unigramme.replace('l&apos;', '').replace(';', '')
            key = (unigramme, postag)
            lemmas_dict[key] = lemmas_dict.get(key, 0) + 1

    return lemmas_dict


def _write_rows(path, rows):
    """Write every row on its own line, removing newlines embedded in a row."""
    with open(path, "w", encoding='utf-8') as file_object:
        file_object.writelines(row.replace('\n', '') + '\n' for row in rows)


def write_documents_infos():
    """
    Write, for each topic configuration, two ';'-separated CSV files.

    For every ``num_topic`` in ``relevant_i_by_topics``:

    * ``<documents_info_name>_<num_topic>.csv`` gets a header row followed by
      one row per document, produced by ``write_info_europresse`` or
      ``write_info_another`` depending on ``is_europresse``.
    * ``<documents_lemmas_info_name>_<num_topic>.csv`` gets the same header
      prefixed with ``lemma;postag_lemma;count_lemma;`` followed by the rows
      returned by ``write_info_europresse_lemmas`` or
      ``write_info_another_lemmas`` for the per-document lemma counts.

    The header and the lemma counting are shared by both source types and are
    delegated to the private helpers defined above.
    """
    global every_topics_and_scores_by_document
    global is_europresse
    global columns_dicts
    global all_tab_pos

    for num_topic in tqdm(relevant_i_by_topics, bar_format="{l_bar}{bar:10}{r_bar}"):
        if is_europresse:
            leading_columns = ['title', 'authors', 'raw_authors', 'nb_characters', 'journal', 'date']
        else:
            leading_columns = list(columns_dicts) + ['nb_characters']

        title_row = _build_title_row(leading_columns, num_topic)
        rows = [title_row]
        rows_lemmas = ['lemma;postag_lemma;count_lemma;' + title_row]

        topics_and_scores = every_topics_and_scores_by_document[num_topic]

        if is_europresse:
            # Only documents that have both a soup and topic scores are written.
            nb_documents = min(len(all_soups), len(topics_and_scores))
            for i in range(nb_documents):
                rows.append(write_info_europresse(topics_and_scores[i],
                                                  all_soups[i],
                                                  documents_lemmatized[i]))

                lemmas_dict = _count_content_lemmas(all_tab_pos[i])
                rows_lemmas.extend(write_info_europresse_lemmas(lemmas_dict,
                                                                topics_and_scores[i],
                                                                all_soups[i],
                                                                documents_lemmatized[i]))
        else:
            # NOTE(review): unlike the europresse branch, there is no bounds
            # check against topics_and_scores here; kept as in the original.
            for i in range(len(columns_dicts['date'])):
                rows.append(write_info_another(topics_and_scores[i],
                                               columns_dicts,
                                               i,
                                               documents_lemmatized[i]))

                lemmas_dict = _count_content_lemmas(all_tab_pos[i])
                rows_lemmas.extend(write_info_another_lemmas(lemmas_dict,
                                                             topics_and_scores[i],
                                                             columns_dicts,
                                                             i,
                                                             documents_lemmatized[i]))

        _write_rows(documents_info_name + '_' + str(num_topic) + '.csv', rows)
        _write_rows(documents_lemmas_info_name + '_' + str(num_topic) + '.csv', rows_lemmas)

In [None]:
def write_all_pickles():
    """
    Pickle four module-level objects into ``results_path``, one file each.

    ``documents_lemmatized``, ``all_tab_pos``, ``sentences_norms`` and
    ``sentences_lemmas`` are written to ``<results_path><name>.pkl``.
    """
    payloads = (
        ("documents_lemmatized.pkl", documents_lemmatized),
        ("all_tab_pos.pkl", all_tab_pos),
        ("sentences_norms.pkl", sentences_norms),
        ("sentences_lemmas.pkl", sentences_lemmas),
    )

    for filename, payload in payloads:
        with open(f"{results_path}{filename}", 'wb') as handle:
            pickle.dump(payload, handle)

In [None]:
def read_all_pickles():
    """
    Load the four pickle files found under ``results_path``.

    Returns a 4-tuple, in this order: the contents of
    ``documents_lemmatized.pkl``, ``all_tab_pos.pkl``, ``sentences_norms.pkl``
    and ``sentences_lemmas.pkl``. Module-level globals are not modified.
    """
    filenames = (
        "documents_lemmatized.pkl",
        "all_tab_pos.pkl",
        "sentences_norms.pkl",
        "sentences_lemmas.pkl",
    )

    loaded = []
    for filename in filenames:
        # NOTE: pickle.load can execute arbitrary code; only use on trusted files.
        with open(f"{results_path}{filename}", 'rb') as handle:
            loaded.append(pickle.load(handle))

    return tuple(loaded)