In [2]:
import os
from pypdf import PdfReader, PdfWriter
from pypdf.generic import NameObject, TextStringObject
import re
from nltk.corpus import stopwords
from collections import defaultdict
from nltk.stem import SnowballStemmer
import json

In [4]:
# Root folder that is walked to find the recipe documents.
path = "../data/documents"
# Folder that receives the per-prefix JSON index files.
path_index_docs = "../indexes/doc_indexes"

# Extraire tous les chemins des fichiers de recettes

In [5]:
# Collect the path of every PDF found under `path`, in a reproducible order.
# os.walk yields entries in whatever order the operating system returns them,
# and the position of a file in this list is later stored as its document
# index, so both the traversal and the file names are sorted explicitly.
path_filenames_list = []
for dirname, subfolders, filenames in os.walk(path):
    # Sorting in place controls the order in which os.walk descends.
    subfolders.sort()
    for file in sorted(filenames):
        # Skip stray non-PDF entries (thumbnails, checkpoints, ...) that
        # PdfReader would not be able to open.
        if not file.lower().endswith(".pdf"):
            continue
        path_filenames_list.append(os.path.join(dirname, file))

path_filenames_list

['../data/documents\\Boeuf Bourguignon.pdf',
 '../data/documents\\Bouillabaisse.pdf',
 '../data/documents\\Cassoulet.pdf',
 '../data/documents\\Clafoutis aux Cerises.pdf',
 '../data/documents\\Coq au Vin.pdf',
 '../data/documents\\Croque.pdf',
 '../data/documents\\Crêpes Suzette.pdf',
 '../data/documents\\Galette des Rois.pdf',
 '../data/documents\\Gratin Dauphinois.pdf',
 '../data/documents\\Mousse au Chocolat.pdf',
 '../data/documents\\Poulet Basquaise.pdf',
 '../data/documents\\Quenelles de Lyon.pdf',
 '../data/documents\\Quiche Lorraine.pdf',
 '../data/documents\\Ratatouille.pdf',
 '../data/documents\\Salade Niçoise.pdf',
 '../data/documents\\Soupe à oignon.pdf',
 '../data/documents\\Tarte Tatin.pdf']

# Ajouter les indices des fichiers dans les métadonnées

In [6]:
# Stamp every PDF with its position in `path_filenames_list`, stored under the
# custom "/indice_doc" metadata key, then rewrite the file in place.
for doc_index, file_path in enumerate(path_filenames_list):
    pdf_reader = PdfReader(file_path)

    # `metadata` is None when the PDF carries no document-information
    # dictionary; start from an empty mapping so the index can still be stored.
    metadata = pdf_reader.metadata
    if metadata is None:
        metadata = {}
    metadata[NameObject("/indice_doc")] = TextStringObject(str(doc_index))
    print(metadata)

    # Assemble the complete output document BEFORE reopening the source file:
    # mode "wb" truncates it immediately, so an error while copying metadata
    # or pages must not be able to happen after that point.
    pdf_writer = PdfWriter()
    pdf_writer.add_metadata(metadata)
    for page in pdf_reader.pages:
        pdf_writer.add_page(page)

    with open(file_path, "wb") as file:
        pdf_writer.write(file)

{'/Producer': 'Microsoft® Word 2019', '/Author': 'yassine1 yassine2', '/Creator': 'Microsoft® Word 2019', '/CreationDate': "D:20241207103932+01'00'", '/ModDate': "D:20241207103932+01'00'", '/indice_doc': '0'}
{'/Producer': 'Microsoft® Word 2019', '/Author': 'yassine1 yassine2', '/Creator': 'Microsoft® Word 2019', '/CreationDate': "D:20241207103510+01'00'", '/ModDate': "D:20241207103510+01'00'", '/indice_doc': '1'}
{'/Producer': 'Microsoft® Word 2019', '/Author': 'yassine1 yassine2', '/Creator': 'Microsoft® Word 2019', '/CreationDate': "D:20241207103622+01'00'", '/ModDate': "D:20241207103622+01'00'", '/indice_doc': '2'}
{'/Producer': 'Microsoft® Word 2019', '/Author': 'yassine1 yassine2', '/Creator': 'Microsoft® Word 2019', '/CreationDate': "D:20241207104215+01'00'", '/ModDate': "D:20241207104215+01'00'", '/indice_doc': '3'}
{'/Producer': 'Microsoft® Word 2019', '/Author': 'yassine1 yassine2', '/Creator': 'Microsoft® Word 2019', '/CreationDate': "D:20241207103532+01'00'", '/ModDate': "D

# dictionnaire de mots

In [7]:
# Build the inverted index:
#   word_dict[prefix][stem] -> list of [document index, frequency] pairs,
# where `prefix` is the first two characters of the stemmed word.
stemmer = SnowballStemmer("french")
word_dict = defaultdict(dict)

# Load the stop-word list once. Evaluating stopwords.words('french') inside the
# per-token filter rebuilt the whole list for every single word; a set also
# makes each membership test constant-time.
french_stopwords = set(stopwords.words('french'))

for file in path_filenames_list:
    reader = PdfReader(file)
    metadata = reader.metadata

    # Document index previously written into the PDF metadata.
    indice_doc = metadata["/indice_doc"]

    # NOTE(review): only the first page is indexed — confirm that every
    # document fits on a single page.
    page = reader.pages[0]
    content = page.extract_text()

    # Normalise case so that the same word always maps to the same entry.
    content = content.lower()

    # Replace punctuation and digits with spaces, then collapse whitespace.
    clean_content = re.sub(r"[^\w\sàâäéèêëîïôöùûüç]", ' ', content)
    clean_content = re.sub(r"\d", ' ', clean_content)
    clean_content = re.sub(r"\s+", ' ', clean_content)

    # Split the text into individual tokens.
    words = clean_content.split()

    # Drop French stop words and the isolated token "o" (presumably list-bullet
    # glyphs from the PDF extraction — TODO confirm), then stem what remains.
    # Filtering on the exact token also removes an "o" that is adjacent to
    # another "o" or sits at the very start/end of the text, which a
    # substitution on the pattern " o " leaves behind.
    clean_words = [
        stemmer.stem(token)
        for token in words
        if token != "o" and token not in french_stopwords
    ]

    # Count how often each stem occurs in this document.
    word_count = defaultdict(int)
    for word in clean_words:
        word_count[word] += 1

    # Register the [document index, frequency] pair under the stem's prefix.
    for word, freq in word_count.items():
        prefix = word[:2]
        word_dict[prefix].setdefault(word, []).append([int(indice_doc), freq])

word_dict

defaultdict(dict,
            {'bo': {'boeuf': [[0, 1]],
              'bourguignon': [[0, 1]],
              'bouquet': [[0, 2]],
              'bouillabaiss': [[1, 1]],
              'bouillon': [[1, 2], [15, 2]],
              'bord': [[7, 1]],
              'bouill': [[11, 1]],
              'bol': [[12, 1], [15, 1]],
              'boît': [[14, 1]]},
             'in': {'ingrédient': [[0, 1],
               [1, 1],
               [2, 1],
               [3, 1],
               [4, 1],
               [5, 1],
               [6, 1],
               [7, 1],
               [8, 1],
               [9, 1],
               [10, 1],
               [11, 1],
               [12, 1],
               [13, 1],
               [14, 2],
               [15, 1],
               [16, 1]],
              'instruct': [[0, 1],
               [1, 1],
               [2, 1],
               [3, 1],
               [4, 1],
               [5, 1],
               [6, 1],
               [7, 1],
               [8, 1],
    

In [23]:
# Write one JSON file per prefix (each key of `word_dict`) into
# `path_index_docs`.
# Create the output folder first: open(..., "w") does not create missing
# parent directories and would raise FileNotFoundError on a fresh checkout.
os.makedirs(path_index_docs, exist_ok=True)

for prefix, words_data in word_dict.items():
    output_file = os.path.join(path_index_docs, f"{prefix}.json")
    # ensure_ascii=False keeps accented characters readable in the files.
    with open(output_file, "w", encoding="utf-8") as json_file:
        json.dump(words_data, json_file, ensure_ascii=False, indent=4)

print("Les fichiers JSON ont été générés avec succès.")

Les fichiers JSON ont été générés avec succès.
