In [4]:
import os
from pypdf import PdfReader
from pypdf.errors import PdfReadError
from tqdm import tqdm
import re
from langchain.text_splitter import CharacterTextSplitter
from sentence_transformers import SentenceTransformer

  from tqdm.autonotebook import tqdm, trange


In [5]:
# Define paths
from eurocc_v1.paths import DATA_DIR, OUT_DIR
src_folder = DATA_DIR / "raw"
res_folder = OUT_DIR / "pdf_to_text"
res_folder.mkdir(exist_ok=True)

In [6]:
def parse_single_doc(file_path):
    text = []
    reader = PdfReader(file_path)

    for page in tqdm(reader.pages[:]):

        # Extract text from the page
        curr_text = page.extract_text()
        if curr_text is None:
            raise ValueError("No text could be extracted from the PDF page.")

        text.append(curr_text)
        text.append("\n\n====================================END PAGE====================================\n\n")

    return "".join(text)

# Function to remove header from text
def remove_header(text):
    header = r'PROCEDURA\s+PRO\d+\s+-P-IT\s+REV\.\s+\d+\s+PRO\d\s+\d+-P-IT\s+REV\.\s+\d+\s+SELEZIONE,\s+AUTORIZZAZIONE\s+E\s+QUALIFICA\s+DEI\s+FORNITORI\s+'
    _RE_HEADER = re.compile(header, re.IGNORECASE)
    text = _RE_HEADER.sub("", text).strip()
    return text

# Function to remove footer from text
def remove_footer(text):
    footer = r'Template:\s+QUA\d+\s+-T-CO\s+it\s+rev\d+\s+©\s+Copyright\s+Selex\s+ES\s+S\.p\.A\.\s+\d+\s+\d+\s+–\s+Tutti\s+i\s+diritti\s+riservati\s+Pag\.\s+\d+\s+di\s+\d+'
    _RE_FOOTER = re.compile(footer, re.IGNORECASE)
    text = _RE_FOOTER.sub("", text).strip()
    return text

# Function to remove empty lines from text
def remove_empty_lines(text):
    _RE_EMPTY_LINES = re.compile(r"^(?:[\t ]*(?:\r?\n|\r))+")
    text = _RE_EMPTY_LINES.sub(" ", text).strip()
    return text


In [7]:
def clean_text_total():
    num_errors = 0

    filenames = sorted(os.listdir(src_folder))
    print(f"Number of documents to process: {len(filenames)}")

    for ind, filename in enumerate(filenames):
        print(f"Processing PDF #{ind + 1}: {filename}")

        try:
            # Construct the full file path
            file_path = os.path.join(src_folder, filename)
            if not os.path.isfile(file_path):
                raise FileNotFoundError(f"The file {file_path} does not exist.")

            # Extract text from PDF
            extracted_text = parse_single_doc(file_path)

            # Clean the extracted text
            cleaned_text = remove_header(extracted_text)
            cleaned_text = remove_footer(cleaned_text)
            cleaned_text = remove_empty_lines(cleaned_text)

            # Write the cleaned text to a new file in OUT_DIR
            output_file_path = os.path.join(res_folder, f"RES_{filename[:12]}.txt")
            with open(output_file_path, "w") as out_file:
                out_file.write(cleaned_text)

            print("Extraction, cleaning, and saving completed successfully!")

        except FileNotFoundError as fnf_error:
            print(f"FileNotFoundError: {fnf_error}")
            num_errors += 1
        except PdfReadError as pdf_error:
            print(f"PdfReadError: Error reading PDF file: {pdf_error}")
            num_errors += 1
        except Exception as e:
            print(f"An unexpected error occurred: {e}")
            num_errors += 1

    print(f"Total errors: {num_errors}")
    return cleaned_text


cleaned_text = clean_text_total()


Number of documents to process: 3
Processing PDF #1: PRO002-P-IT rev.01 Selezione  Autorizzazione e Qualifica dei Fornitori cg.pdf


  0%|          | 0/42 [00:00<?, ?it/s]

100%|██████████| 42/42 [00:02<00:00, 19.32it/s]


Extraction, cleaning, and saving completed successfully!
Processing PDF #2: PRO011-P-IT - rev00 -Gestione Procurement delle Richieste d'Acquisto- finale cg.pdf


100%|██████████| 43/43 [00:02<00:00, 18.80it/s]


Extraction, cleaning, and saving completed successfully!
Processing PDF #3: PRO012-P-IT rev.01 - Emissione e Gestione Ordini d'Acquisto cg.pdf


100%|██████████| 76/76 [00:03<00:00, 20.62it/s]

Extraction, cleaning, and saving completed successfully!
Total errors: 0





In [9]:
class TextChunker:
    def __init__(self, chunk_size=200, size_overlap=0, separator=' '):

        self.chunk_size = chunk_size
        self.size_overlap = size_overlap
        self.separator = separator
        self.text_splitter = CharacterTextSplitter(separator=self.separator, chunk_size=self.chunk_size, chunk_overlap=self.size_overlap)

    def chunk_text(self, text, mode='fixed_size'):

        chunks = self.text_splitter.split_text(text)


        return chunks

    def save_chunks_to_files(self, chunks, output_dir):

        os.makedirs(output_dir, exist_ok=True)

        for i, chunk in enumerate(chunks):
            output_file_path = os.path.join(output_dir, f"chunk_{i+1}.txt")
            with open(output_file_path, "w") as f:
                f.write(chunk)
            print(f"Chunk {i+1} saved to {output_file_path}")


chunker = TextChunker()
chunks = chunker.chunk_text(cleaned_text)
#chunker.save_chunks_to_files(chunks, OUT_DIR)

print(chunks)




Chunk 1 saved to /leonardo_work/PhDLR_prod/llm_eucc/eurocc/output/chunk_1.txt
Chunk 2 saved to /leonardo_work/PhDLR_prod/llm_eucc/eurocc/output/chunk_2.txt
Chunk 3 saved to /leonardo_work/PhDLR_prod/llm_eucc/eurocc/output/chunk_3.txt
Chunk 4 saved to /leonardo_work/PhDLR_prod/llm_eucc/eurocc/output/chunk_4.txt
Chunk 5 saved to /leonardo_work/PhDLR_prod/llm_eucc/eurocc/output/chunk_5.txt
Chunk 6 saved to /leonardo_work/PhDLR_prod/llm_eucc/eurocc/output/chunk_6.txt
Chunk 7 saved to /leonardo_work/PhDLR_prod/llm_eucc/eurocc/output/chunk_7.txt
Chunk 8 saved to /leonardo_work/PhDLR_prod/llm_eucc/eurocc/output/chunk_8.txt
Chunk 9 saved to /leonardo_work/PhDLR_prod/llm_eucc/eurocc/output/chunk_9.txt
Chunk 10 saved to /leonardo_work/PhDLR_prod/llm_eucc/eurocc/output/chunk_10.txt
Chunk 11 saved to /leonardo_work/PhDLR_prod/llm_eucc/eurocc/output/chunk_11.txt
Chunk 12 saved to /leonardo_work/PhDLR_prod/llm_eucc/eurocc/output/chunk_12.txt
Chunk 13 saved to /leonardo_work/PhDLR_prod/llm_eucc/euroc

In [10]:
for i, chunk in enumerate(chunks):
  print("------ new chunk ---------")
  print(chunk)
  print()

------ new chunk ---------
Le informazioni contenute nel presente documento sono di proprietà di Selex ES S.p.A. e di Selex ES Ltd. e non possono, 
al pari di tale documento, essere riprodotte, utilizzate o divulgate in tutto o

------ new chunk ---------
in parte a terzi senza preventiva autorizzazi one scritt a di 
Selex ES S.p.A. e di Selex ES Ltd. 
Il documento è disponibile nell’Intranet Aziendale/BMS di Selex ES. Le copie, sia in formato

------ new chunk ---------
elettronico che cartaceo dovrann o essere 
verificate, prima dell’utilizzo, con la versione vigente disponibile su Intranet. In caso di eventuali discrepanze tra la versione inglese e la 
versione

------ new chunk ---------
italiana, prevale la versione inglese. 
© Copyright Selex ES S.p.A. and Selex ES Ltd 2014 - Tutti i diritti riservati 
 
IDENTIFICATIVO : PRO012 -P-IT rev. 01 
DATA: 30/12/2015 
TIPO DOCUMENTO :

------ new chunk ---------
PROCESSO 
APPLICAZIONE : Selex ES S.p.A. 
 
 
 
 
 
 
EMISSIONE E GESTIONE D

In [11]:
class TextEmbedder:
    def __init__(self, encoder_name="BAAI/bge-m3"):

        self.model = SentenceTransformer(encoder_name)

    def generate_embeddings(self, chunks):

        embeddings = self.model.encode(chunks)
        return embeddings


embedder = TextEmbedder()
embeddings = embedder.generate_embeddings(chunks)




: 