# Libraries

In [3]:
import cv2
import pytesseract
from pytesseract import Output
import numpy as np
from pdf2image import convert_from_path
from PIL import Image
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import Levenshtein

# constants

In [6]:
PDF_PATH = '../data/pv.pdf'
PDF_PATH = '../data/extrait_PV.pdf'
PROCESSED_DATA_PATH = '../data/processed/'
TITLE = "OBJET"
#DPI, WIDTH, HEIGHT = 72, 2480, 3508
#DPI, WIDTH, HEIGHT = 72, 1240, 1754
DPI, WIDTH, HEIGHT = 300, 2481, 3507
EXTRA_HEIGHT = 100
SIMILARITY_THRESHOLD = 0.9
LINE_JOIN_THRESHOLD = 10

In [7]:
TITLE_LIST=["OBJET",
            "Maitre d'ouvrage",
            "Date d'ouverture des plis",
            "Journaux de publication de l'avis d'AO",
            "publication de l'Avis d'AO",
            "ELECTRONIQUE",
            "LISTE DES CONCURRENTS EVINCES A TISSUE DE L'EXAMEN DES DOSSTERS ADMINISTRATIFS",
            "LISTE DES CONCURRENTS ADMISSIBLES SANS RESERVE",
            "LISTE DES CONCURRENTS ADMISSIBLES AVEC RESERVE",
            "MONTANT DES ACTES D'ENGAGEMENTS DES CONCURRENTS RETENUS",
            "CONCURRENT INVITE A DEPOSER LE COMPLEMENT DU DOSSIER ADMINISTRATIF",
            "JUSTIFICATION DU CHOIX"
          ]

# identify sections by similarity

### Src

In [4]:
# Function to convert PDF to images
def pdf_to_images(pdf_path, dpi=DPI):
    images = convert_from_path(pdf_path, dpi=dpi)
    return images

# Function to preprocess image
def preprocess_image(image):
    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    _, binary = cv2.threshold(gray, 150, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
    return binary

# Function to calculate text similarity using TF-IDF
def calculate_similarity(text1, text2):
    vectorizer = TfidfVectorizer().fit_transform([text1, text2])
    vectors = vectorizer.toarray()
    cosine_sim = cosine_similarity(vectors)
    return cosine_sim[0, 1]


# Function to find the title using text similarity
def find_title(image, title, threshold=SIMILARITY_THRESHOLD):
    data = pytesseract.image_to_data(image, output_type=Output.DICT)
    n_boxes = len(data['level'])
    lines = {}

    # Group words into lines
    for i in range(n_boxes):
        text = data['text'][i].strip()
        if text:
            top = data['top'][i]
            if top in lines:
                lines[top].append(text)
            else:
                lines[top] = [text]
    print(lines)
    title_y = None
    max_similarity = 0

    # Calculate similarity for each line
    for top, words in lines.items():
        text_line = ' '.join(words)
        similarity = calculate_similarity(title.lower(), text_line.lower())

        if similarity > max_similarity and similarity >= threshold:
            max_similarity = similarity
            title_y = top

    return title_y


# Function to crop the section containing the title
def crop_section(image, coordinates, extra_height=EXTRA_HEIGHT):
    x, y, w, h = coordinates
    #cropped_image = image[y:y+h+extra_height, x:x+w]
    cropped_image = image[y:y+h+extra_height, 0:WIDTH]
    return cropped_image

# Function to save and display cropped images
def save_and_display_image(image, page_number, title):
    # Convert OpenCV image to PIL format for saving
    cropped_image_pil = Image.fromarray(cv2.cvtColor(image, cv2.COLOR_BGR2RGB))
    output_path = f'cropped_page_{page_number}_title_{title}.png'
    cropped_image_pil.save(output_path)
    cropped_image_pil.show()
    return output_path

# join lines before processing

### Src

image processing functions

In [5]:
# Function to convert PDF to images
def pdf_to_images(pdf_path, dpi):
    images = convert_from_path(pdf_path, dpi=dpi)
    return images

# Function to preprocess image
def preprocess_image(image):
    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    _, binary = cv2.threshold(gray, 150, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
    return binary

# Function to save and display cropped images
def save_and_display_image(image, page_number, title):
    # Convert OpenCV image to PIL format for saving
    cropped_image_pil = Image.fromarray(cv2.cvtColor(image, cv2.COLOR_BGR2RGB))
    output_path = f'{PROCESSED_DATA_PATH}{title}_page_{page_number}.png'
    cropped_image_pil.save(output_path)
    cropped_image_pil.show()
    return output_path

# Function to crop a section
def crop_section(image, y_up,y_down, extra_height=EXTRA_HEIGHT):
    cropped_image = image[y_up:y_down, 0:WIDTH]
    return cropped_image


layout zoning functions

In [6]:
# Function to join close lines
def join_close_lines(lines, threshold):
    sorted_lines = sorted(lines.items())  # Sort lines by their top coordinate
    joined_lines = {}
    current_top, current_words = sorted_lines[0]

    for top, words in sorted_lines[1:]:
        if top - current_top <= threshold:
            current_words.extend(words)
        else:
            joined_lines[current_top] = current_words
            current_top, current_words = top, words

    joined_lines[current_top] = current_words
    return joined_lines
# Function to find the title using keywords
def find_title(image, keyword):
    data = pytesseract.image_to_data(image, output_type=Output.DICT)
    n_boxes = len(data['level'])
    lines = {}

    # Group words into lines
    for i in range(n_boxes):
        text = data['text'][i].strip()
        if text:
            top = data['top'][i]
            if top in lines:
                lines[top].append(text)
            else:
                lines[top] = [text]

    # Join close lines
    lines = join_close_lines(lines, threshold = LINE_JOIN_THRESHOLD)

    title_y = None
    # Check each line for keyword
    for top, words in lines.items():
        text_line = ' '.join(words).lower()
        #if all(keyword in text_line for keyword in keywords):
        if keyword in text_line:
            print(f"found title '{keyword}' in line '{text_line}'")
            title_y = top
            break
    return title_y

def shift_dict_values(y_ups,height=HEIGHT):
    sorted_items = sorted(y_ups.items(), key=lambda item: item[1])
    keys, values = zip(*sorted_items)
    # Shift the values up
    shifted_values = list(values[1:]) + [height]
    y_downs = dict(zip(keys, shifted_values))
    return y_downs

ocr functions

In [9]:
# Function to perform OCR on the cropped section (optional)
def perform_ocr(image):
    text = pytesseract.image_to_string(image)
    return text

### Main

In [21]:
# Main function to run the process
def main(pdf_path, keywords, dpi=DPI):
    # Convert PDF to images
    images = pdf_to_images(pdf_path, dpi)
    y_ups={}
    for keyword in keywords:
        print(f"searching for keyword {keyword}")
        # Iterate over each page
        for page_number, image in enumerate(images, start=1):
            # Convert PIL image to OpenCV format
            image_cv = cv2.cvtColor(np.array(image), cv2.COLOR_RGB2BGR)
            preprocessed_image = preprocess_image(image_cv)
            # Perform OCR and get data with bounding boxes
            data = pytesseract.image_to_data(preprocessed_image, output_type=Output.DICT)
            y_start = find_title(preprocessed_image, keyword=keyword)
            print(f'ystart for {keyword} is {y_start}')
            if y_start is not None:
                y_ups[keyword] = y_start
                # Crop the section containing the title
                #cropped_image = crop_section(preprocessed_image, y_start, data)
                #save_and_display_image(cropped_image, page_number, keywords)
                #text = perform_ocr(cropped_image)
            #else:
                #print(f"Keywords '{keywords}' not found on Page {page_number}.")
    y_downs = shift_dict_values(y_ups=y_ups)
    #cropped_images={}
    texts = {}
    for key in y_ups.keys():
        y_up=y_ups[key]
        y_down=y_downs[key]
        cropped_image = crop_section(preprocessed_image, y_up, y_down)
        #cropped_images[key]=cropped_image
        text = perform_ocr(cropped_image)
        texts[key] = text
        print(f"text of {key} is {text}")
        save_and_display_image(cropped_image, page_number, key)
    print("y_ups: ", y_ups)
    print("y_downs: ", y_downs)
    return texts

#keywords = [["objet"],["journal"],["date"]]#,["liste", "concurrents"]]
#keywords = [("objet"), ("liste"),("date"),("lieu"),("maitre"), ("journal")]
keywords = [("objet"),("montant"),("retenu"), ("liste"),("date"),("lieu"),("maitre"), ("journal")]
texts=main(PDF_PATH, keywords)


searching for keyword objet
found title 'objet' in line 'objet: d’alimentation potable la population des douars tifrmite ait ighir ait el hakem et travaux'
ystart for objet is 459
searching for keyword montant
found title 'montant' in line 'des d'engagement des montant actes concurrents:'
ystart for montant is 1730
searching for keyword retenu
found title 'retenu' in line 'concurrent retenu:'
ystart for retenu is 2444
searching for keyword liste
found title 'liste' in line '-portail des marches publics www.marchespublics.gov.ma du 05/05/2024liste des'
ystart for liste is 1060
searching for keyword date
found title 'date' in line 'd'ouverture des plis: 05/06/2024 a 10h date :00mn.'
ystart for date is 626
searching for keyword lieu
found title 'lieu' in line 'lieu d'ouverture des plis: salle des réunions de la province de tiznit..'
ystart for lieu is 682
searching for keyword maitre
found title 'maitre' in line 'd'ouvrage: président du conseil communal de la idagougmar. maitre ct'
ystart

### storing in a pandas data frame

In [53]:
import pandas as pd

df=pd.DataFrame(columns = keywords+["not_found_data"])
print(df)

Empty DataFrame
Columns: [objet, montant, retenu, liste, date, lieu, maitre, journal, not_found_data]
Index: []


In [54]:
df = pd.concat([df, pd.DataFrame([texts])], ignore_index=True)

In [55]:
df.head()

Unnamed: 0,objet,montant,retenu,liste,date,lieu,maitre,journal,not_found_data
0,Objet: travaux dalimentation en eau potable po...,Montant des actes d'engagement des concurrents...,Concurrent retenu:\n\n \n\nConcurrent Montant ...,-Portail des marches publics www.marcnespublic...,Date d'ouverture des plis: 03/06/2024 a 10h :‘...,Lieu d’ouverture des plis: Salle des reunions ...,Maitre d'ouvrage: Président du Conseil Communa...,Publie dans le journal a alifusion nationale a...,


In [56]:
df.to_csv(PROCESSED_DATA_PATH+'df.csv')

# use embedding instead

### Src

image processing functions

In [8]:
# Function to convert PDF to images
def pdf_to_images(pdf_path, dpi):
    images = convert_from_path(pdf_path, dpi=dpi)
    return images

# Function to preprocess image
def preprocess_image(image):
    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    _, binary = cv2.threshold(gray, 150, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
    return binary

# Function to save and display cropped images
def save_and_display_image(image, page_number, title):
    # Convert OpenCV image to PIL format for saving
    cropped_image_pil = Image.fromarray(cv2.cvtColor(image, cv2.COLOR_BGR2RGB))
    output_path = f'{PROCESSED_DATA_PATH}{title}_page_{page_number}.png'
    cropped_image_pil.save(output_path)
    cropped_image_pil.show()
    return output_path

# Function to crop a section
def crop_section(image, y_up,y_down, extra_height=EXTRA_HEIGHT):
    cropped_image = image[y_up:y_down, 0:WIDTH]
    return cropped_image

layout zoning functions

In [9]:
# Function to join close lines
def join_close_lines(lines, threshold):
    sorted_lines = sorted(lines.items())  # Sort lines by their top coordinate
    joined_lines = {}
    current_top, current_words = sorted_lines[0]

    for top, words in sorted_lines[1:]:
        if top - current_top <= threshold:
            current_words.extend(words)
        else:
            joined_lines[current_top] = current_words
            current_top, current_words = top, words

    joined_lines[current_top] = current_words
    return joined_lines
# Function to find the title using keywords
def find_title(image, keyword):
    data = pytesseract.image_to_data(image, output_type=Output.DICT)
    n_boxes = len(data['level'])
    lines = {}

    # Group words into lines
    for i in range(n_boxes):
        text = data['text'][i].strip()
        if text:
            top = data['top'][i]
            if top in lines:
                lines[top].append(text)
            else:
                lines[top] = [text]

    # Join close lines
    lines = join_close_lines(lines, threshold = LINE_JOIN_THRESHOLD)

    title_y = None
    # Check each line for keyword
    for top, words in lines.items():
        text_line = ' '.join(words).lower()
        #if all(keyword in text_line for keyword in keywords):
        if keyword in text_line:
            print(f"found title '{keyword}' in line '{text_line}'")
            title_y = top
            break
    return title_y

def shift_dict_values(y_ups,height=HEIGHT):
    sorted_items = sorted(y_ups.items(), key=lambda item: item[1])
    keys, values = zip(*sorted_items)
    # Shift the values up
    shifted_values = list(values[1:]) + [height]
    y_downs = dict(zip(keys, shifted_values))
    return y_downs

ocr functions

In [10]:
# Function to perform OCR on the cropped section (optional)
def perform_ocr(image):
    text = pytesseract.image_to_string(image)
    return text

### Main

In [11]:
def main(pdf_path, dpi=DPI):
    # Convert PDF to images
    images = pdf_to_images(pdf_path, dpi)
    # Iterate over each page
    for page_number, image in enumerate(images, start=1):
        # Convert PIL image to OpenCV format
        image_cv = cv2.cvtColor(np.array(image), cv2.COLOR_RGB2BGR)
        preprocessed_image = preprocess_image(image_cv)
        # Perform OCR and get data with bounding boxes
        text = perform_ocr(preprocessed_image)
    return text

#keywords = [["objet"],["journal"],["date"]]#,["liste", "concurrents"]]
#keywords = [("objet"), ("liste"),("date"),("lieu"),("maitre"), ("journal")]
#keywords = [("objet"),("montant"),("retenu"), ("liste"),("date"),("lieu"),("maitre"), ("journal")]
text=main(PDF_PATH)


In [12]:
print(text)

Royaume du Maroc
CT Idagougmar

Extrait du procés-verbal de la séance de L'appel d'offres n° 03/2024

Objet: travaux d’alimentation en eau potable pour la population des douars Tifrmite ; Ait Ighir ; Ait El Hakem Et
Tighighite de La CT Ida Gougmar

Maitre d'ouvrage: Président du Conseil Communal de la CT Idagougmar.

Date d'ouverture des plis: 05/06/2024 a 10h :00mn.

Lieu d'ouverture des plis: Salle des réunions de la Province de Tiznit..

Publié dans le journal a diffusion nationale a savoir:

- 4eY! alley : en date du 10.05.2024 sous le n° : 12 735 ;
- liberation : en date du 10.05.2024 sous le n° : 10 218 ;

-Portail des marches publics www.marchespublics.gov.ma du 05/05/2024Liste des
concurrents ayant déposé leurs plis:

Groupement solidaire Sté VIV AQUA SARL et Sté BELID DE CONSTRUCTION SARL
STE KHALIFID SARL

Liste des concurrents écartés 4 l'issue de l'examen des dossiers administratifs et techniques: Néant.
Liste des concurrents admis sans réserve:

Groupement solidaire Sté VI

In [13]:
import re

def normalize_whitespace(text):
    # Replace multiple whitespace characters with a single space
    text = re.sub(r'\s+', ' ', text)
    # Strip leading and trailing whitespace
    text = text.strip()
    return text


In [14]:
text_processed = normalize_whitespace(text)
print(text)

Royaume du Maroc
CT Idagougmar

Extrait du procés-verbal de la séance de L'appel d'offres n° 03/2024

Objet: travaux d’alimentation en eau potable pour la population des douars Tifrmite ; Ait Ighir ; Ait El Hakem Et
Tighighite de La CT Ida Gougmar

Maitre d'ouvrage: Président du Conseil Communal de la CT Idagougmar.

Date d'ouverture des plis: 05/06/2024 a 10h :00mn.

Lieu d'ouverture des plis: Salle des réunions de la Province de Tiznit..

Publié dans le journal a diffusion nationale a savoir:

- 4eY! alley : en date du 10.05.2024 sous le n° : 12 735 ;
- liberation : en date du 10.05.2024 sous le n° : 10 218 ;

-Portail des marches publics www.marchespublics.gov.ma du 05/05/2024Liste des
concurrents ayant déposé leurs plis:

Groupement solidaire Sté VIV AQUA SARL et Sté BELID DE CONSTRUCTION SARL
STE KHALIFID SARL

Liste des concurrents écartés 4 l'issue de l'examen des dossiers administratifs et techniques: Néant.
Liste des concurrents admis sans réserve:

Groupement solidaire Sté VI

embedding

In [15]:
from transformers import BertModel, BertTokenizer
#import torch

# Load pre-trained BERT model and tokenizer for French
model_name = "camembert-base"
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertModel.from_pretrained(model_name)

def get_embeddings(text):
    inputs = tokenizer(text, return_tensors='pt', truncation=True, padding=True)
    outputs = model(**inputs)
    return outputs.last_hidden_state.mean(dim=1)

# Example to get embeddings for a section
section_text = "some section text"
embeddings = get_embeddings(section_text)


  from .autonotebook import tqdm as notebook_tqdm
None of PyTorch, TensorFlow >= 2.0, or Flax have been found. Models won't be available and only tokenizers, configuration and file/data utilities can be used.
The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'CamembertTokenizerFast'. 
The class this function is called from is 'BertTokenizer'.


TypeError: stat: path should be string, bytes, os.PathLike or integer, not NoneType