# Project Memory

## Environment elements

In [7]:
import os
from openai import OpenAI
from dotenv import load_dotenv
from pathlib import Path

# Load the .env file whose location is given by the ENV_IH1 environment variable.
env_ih1 = os.getenv("ENV_IH1")

if env_ih1:
    dotenv_path = Path(env_ih1)
    load_dotenv(dotenv_path=dotenv_path)
else:
    # ENV_IH1 is unset: Path(None) would raise TypeError, so fall back to
    # python-dotenv's default .env discovery instead of crashing the cell.
    dotenv_path = None
    print("ENV_IH1 is not set; falling back to the default .env lookup.")
    load_dotenv()

# API keys: each one is None when the corresponding variable is not defined.
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
PINECONE_API_KEY = os.getenv('PINECONE_KEY')
SERPAPI_API_KEY = os.getenv('SERPAPI_API_KEY')
STEAMSHIP_API_KEY = os.getenv('STEAMSHIP_API_KEY')
LANGSMITH_API_KEY = os.getenv('LANGSMITH_API_KEY')
HUGGINGFACEHUB_API_TOKEN = os.getenv('HUGGINGFACEHUB_API_TOKEN')
GEMINI_KEY = os.getenv('GEMINI_KEY')

# Append /usr/bin to PATH only once, so re-running this cell does not keep
# growing PATH, and a missing PATH variable does not raise KeyError.
_current_path = os.environ.get('PATH', '')
if '/usr/bin' not in _current_path.split(os.pathsep):
    os.environ['PATH'] = (
        _current_path + os.pathsep + '/usr/bin' if _current_path else '/usr/bin'
    )

## Libraries

In [8]:
# This notebook requires Tesseract for OCR
%pip install --upgrade openai

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


# Gemini JSON version

In [9]:
import os
import pandas as pd
import json
from PIL import Image
import google.generativeai as genai
from datetime import datetime
import cv2
import re

# Configure the Gemini client with GEMINI_KEY, which the environment cell
# reads via os.getenv('GEMINI_KEY'); no key is hard-coded here.
genai.configure(api_key=GEMINI_KEY)
# Module-level model handle used by the extraction functions below.
model = genai.GenerativeModel("gemini-1.5-flash-latest")

def get_image_paths_from_folder(folder_path, extensions=('.jpg', '.jpeg', '.png')):
    """Return the image files found directly inside ``folder_path``.

    Args:
        folder_path: Directory to scan (not recursive).
        extensions: A single suffix or any iterable of suffixes; matching is
            case-insensitive.

    Returns:
        A list of ``os.path.join(folder_path, name)`` paths, sorted by file
        name so that pages are processed in a reproducible order.

    Raises:
        OSError: Propagated from ``os.listdir`` (e.g. the folder is missing).
    """
    # str.endswith only accepts a str or a tuple, so normalise other iterables.
    if isinstance(extensions, str):
        suffixes = (extensions.lower(),)
    else:
        suffixes = tuple(ext.lower() for ext in extensions)

    # os.listdir returns entries in arbitrary order; sort for determinism.
    return [
        os.path.join(folder_path, entry)
        for entry in sorted(os.listdir(folder_path))
        if entry.lower().endswith(suffixes)
        and os.path.isfile(os.path.join(folder_path, entry))
    ]

def preprocess_image(image_path):
    """Load an image and return a binarised grayscale version of it.

    Args:
        image_path: Path of the image file to read with OpenCV.

    Returns:
        The thresholded image produced by ``cv2.threshold`` (pixels above 150
        become 255, the others 0).

    Raises:
        ValueError: If OpenCV cannot read the file.
    """
    image = cv2.imread(image_path)
    # cv2.imread signals failure by returning None instead of raising; fail
    # here with a clear message rather than inside cv2.cvtColor.
    if image is None:
        raise ValueError(f"Could not read image: {image_path}")
    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    _, thresh = cv2.threshold(gray, 150, 255, cv2.THRESH_BINARY)
    return thresh

def extract_page_region(image_path, location='top', fraction=0.5):
    """Crop a horizontal band from the top or bottom of an image.

    Args:
        image_path: Path of the image file to open with Pillow.
        location: ``'top'`` or ``'bottom'``; any other value yields ``None``.
        fraction: Share of the image height covered by the band. The default
            of 0.5 keeps the half-page crop this cell has always used.

    Returns:
        A ``PIL.Image.Image`` holding the cropped band, or ``None`` when
        ``location`` is neither ``'top'`` nor ``'bottom'``.
    """
    # Close the underlying file once the crop has been materialised.
    with Image.open(image_path) as image:
        width, height = image.size
        if location == 'top':
            box = (0, 0, width, int(height * fraction))
        elif location == 'bottom':
            box = (0, int(height * (1 - fraction)), width, height)
        else:
            return None
        region = image.crop(box)
        # Force pixel data to be read while the source file is still open.
        region.load()
        return region

def extract_text_and_page_number(image_path, page_location='top'):
    """Ask Gemini for the full text of a page image and for its page number.

    Args:
        image_path: Path of the page image.
        page_location: ``'top'`` or ``'bottom'`` to look for a page number in
            that region; any other value skips the page-number step.

    Returns:
        A ``(full_text, page_number)`` tuple. ``full_text`` is ``""`` when the
        text extraction fails; ``page_number`` is ``None`` when it is not
        requested, not found, or its extraction fails.
    """
    # Step 1: full-page text. A failure here means there is nothing to return.
    try:
        generation_config = genai.types.GenerationConfig(
            temperature=0.1,
            max_output_tokens=1024
        )
        prompt_full = "Extract the full text from the provided image."
        # Context manager releases the image file handle after the request.
        with Image.open(image_path) as image:
            response_full = model.generate_content(
                [prompt_full, image], generation_config=generation_config
            )
        full_text = response_full.text
    except Exception as e:
        print(f"Error processing {image_path}: {e}")
        return "", None

    # Step 2: optional page number. Handled separately so that a failure here
    # does not throw away the text that was already extracted.
    page_number = None
    if page_location in ('top', 'bottom'):
        try:
            region = extract_page_region(image_path, location=page_location)
            if region is not None:
                prompt_page = "Identify the page number from this image region."
                response_page = model.generate_content(
                    [prompt_page, region], generation_config=generation_config
                )
                # First standalone run of digits in the answer is the page number.
                page_number_match = re.search(r'\b\d+\b', response_page.text)
                if page_number_match:
                    page_number = int(page_number_match.group())
        except Exception as e:
            print(f"Error extracting page number from {image_path}: {e}")

    return full_text, page_number

def process_images(image_paths, page_location='top'):
    """Extract text and page number for every image and collect the results.

    Args:
        image_paths: Iterable of image file paths.
        page_location: Forwarded to ``extract_text_and_page_number``.

    Returns:
        A list with one dict per image, holding the keys ``"Image Path"``,
        ``"Extracted Text"`` and ``"Page Number"``.
    """
    pages = []
    for path in image_paths:
        text, page_number = extract_text_and_page_number(path, page_location=page_location)

        # Debugging: print a preview of the extracted text and the page number
        print(f"Extracted Text for {path}: {text[:100]}...")  # First 100 characters
        print(f"Page Number: {page_number}")

        # Strip first, then fall back: whitespace-only output also counts as
        # "nothing extracted" instead of being stored as an empty string.
        text = str(text).strip() if text else ""
        if not text:
            text = "No text extracted"

        pages.append({
            "Image Path": path,
            "Extracted Text": text,
            "Page Number": page_number
        })
    return pages

def save_to_json(metadata_list, output_dir, name, author):
    """Write the per-page metadata plus document-level fields to a JSON file.

    The file is named ``<name>_<author>.json`` and is created inside
    ``output_dir`` (which is created if it does not exist yet).

    Args:
        metadata_list: Per-page entries stored under the ``"Pages"`` key.
        output_dir: Destination directory.
        name: Document name; also used in the file name.
        author: Document author; also used in the file name.
    """
    os.makedirs(output_dir, exist_ok=True)
    output_path = os.path.join(output_dir, f"{name}_{author}.json")

    # Document-level envelope around the page entries, stamped with the
    # current local time.
    timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    document_metadata = {}
    document_metadata["Author"] = author
    document_metadata["Name"] = name
    document_metadata["Date"] = timestamp
    document_metadata["Pages"] = metadata_list

    # ensure_ascii=False keeps accented characters readable in the file.
    serialized = json.dumps(document_metadata, indent=4, ensure_ascii=False)
    with open(output_path, 'w', encoding='utf-8') as json_file:
        json_file.write(serialized)
    print(f"JSON saved to: {output_path}")

def main(image_folder, output_dir, name, author, page_location='top'):
    """Run the pipeline: list images, extract their text, save one JSON file.

    Args:
        image_folder: Folder scanned for page images.
        output_dir: Directory that receives the JSON file.
        name: Document name stored in the JSON and used in the file name.
        author: Document author stored in the JSON and used in the file name.
        page_location: Region searched for a page number, passed through to
            ``process_images``.
    """
    found_images = get_image_paths_from_folder(image_folder)

    if found_images:
        # Extract text and metadata for every page, then persist the result.
        pages = process_images(found_images, page_location)
        save_to_json(pages, output_dir, name, author)
    else:
        print(f"No images found in folder: {image_folder}")

if __name__ == "__main__":
    # Paths are assembled with os.path.join so they resolve on any OS; on
    # Windows this yields the same backslash-separated strings as before.
    config = {
        'image_folder': os.path.join('data', 'Photos', 'Livre Mémoires 1-50'),  # Folder with the page photos
        'output_dir': os.path.join('data', 'Family safe'),
        'name': "Pour la mémoire familiale 1-50",  # Document name
        'author': "Jean Lambert",  # Author name
        'page_location': 'top',  # Options: 'top', 'bottom', 'none'
    }

    # The config keys match main()'s parameter names one-to-one.
    main(**config)


Extracted Text for data\Photos\Livre Mémoires 1-50\20250127_142502.jpg: POUR LA MÉMOIRE
FAMILIALE

FAMILLE HISTOIRE
SOUVENIRS & COMMENTAIRES

VOLUME 1

Jean-Georges Lambert...
Page Number: 1
Extracted Text for data\Photos\Livre Mémoires 1-50\20250127_142520.jpg: J'ai entrepris ce travail pour mes fils qui tiennent, à parts égales, tant de place dans ma vie.  Je...
Page Number: None
Extracted Text for data\Photos\Livre Mémoires 1-50\20250127_142527.jpg: VOLUME 1

Page
6- Préambule.

TABLE DES MATIERES DES VOLUMES 1 & 2
-TONE 1-MA FAMILLE PATERNELLE -

...
Page Number: None
Extracted Text for data\Photos\Livre Mémoires 1-50\20250127_142533.jpg: -Les sentiments de Bonaparte, puis de Napoléon, envers les juifs -Les décrets de 1808 -
Vers l'égali...
Page Number: None
Extracted Text for data\Photos\Livre Mémoires 1-50\20250127_142539.jpg: sont pas seuls visés par cette campagne Mon père et l'antisémitisme avant l'affaire
Drevfus L'Affair...
Page Number: None
Extracted Text for data\Photos\Li

# Gemini OCR

In [10]:
import os
import pandas as pd
import json
from PIL import Image
import google.generativeai as genai
from datetime import datetime
import cv2
import re

# Configure the Gemini client with GEMINI_KEY, which the environment cell
# reads via os.getenv('GEMINI_KEY'); no key is hard-coded here.
genai.configure(api_key=GEMINI_KEY)
# Module-level model handle used by the extraction functions below.
model = genai.GenerativeModel("gemini-1.5-flash-latest")

def get_image_paths_from_folder(folder_path, extensions=('.jpg', '.jpeg', '.png')):
    """Return the image files found directly inside ``folder_path``.

    Args:
        folder_path: Directory to scan (not recursive).
        extensions: A single suffix or any iterable of suffixes; matching is
            case-insensitive.

    Returns:
        A list of ``os.path.join(folder_path, name)`` paths, sorted by file
        name so that pages are processed in a reproducible order.

    Raises:
        OSError: Propagated from ``os.listdir`` (e.g. the folder is missing).
    """
    # str.endswith only accepts a str or a tuple, so normalise other iterables.
    if isinstance(extensions, str):
        suffixes = (extensions.lower(),)
    else:
        suffixes = tuple(ext.lower() for ext in extensions)

    # os.listdir returns entries in arbitrary order; sort for determinism.
    return [
        os.path.join(folder_path, entry)
        for entry in sorted(os.listdir(folder_path))
        if entry.lower().endswith(suffixes)
        and os.path.isfile(os.path.join(folder_path, entry))
    ]

def preprocess_image(image_path):
    """Load an image and return a binarised grayscale version of it.

    Args:
        image_path: Path of the image file to read with OpenCV.

    Returns:
        The thresholded image produced by ``cv2.threshold`` (pixels above 150
        become 255, the others 0).

    Raises:
        ValueError: If OpenCV cannot read the file.
    """
    image = cv2.imread(image_path)
    # cv2.imread signals failure by returning None instead of raising; fail
    # here with a clear message rather than inside cv2.cvtColor.
    if image is None:
        raise ValueError(f"Could not read image: {image_path}")
    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    _, thresh = cv2.threshold(gray, 150, 255, cv2.THRESH_BINARY)
    return thresh

def extract_page_region(image_path, location='top', fraction=0.1):
    """Crop a horizontal band from the top or bottom of an image.

    Args:
        image_path: Path of the image file to open with Pillow.
        location: ``'top'`` or ``'bottom'``; any other value yields ``None``.
        fraction: Share of the image height covered by the band (default 10%).

    Returns:
        A ``PIL.Image.Image`` holding the cropped band, or ``None`` when
        ``location`` is neither ``'top'`` nor ``'bottom'``.
    """
    # Close the underlying file once the crop has been materialised.
    with Image.open(image_path) as image:
        width, height = image.size
        if location == 'top':
            box = (0, 0, width, int(height * fraction))
        elif location == 'bottom':
            box = (0, int(height * (1 - fraction)), width, height)
        else:
            return None
        region = image.crop(box)
        # Force pixel data to be read while the source file is still open.
        region.load()
        return region

def extract_text_and_page_number(image_path, page_location='top'):
    """Ask Gemini for the full text of a page image and for its page number.

    Args:
        image_path: Path of the page image.
        page_location: ``'top'`` or ``'bottom'`` to look for a page number in
            that region; any other value skips the page-number step.

    Returns:
        A ``(full_text, page_number)`` tuple. ``full_text`` is ``""`` when the
        text extraction fails; ``page_number`` is ``None`` when it is not
        requested, not found, or its extraction fails.
    """
    # Step 1: full-page text. A failure here means there is nothing to return.
    try:
        generation_config = genai.types.GenerationConfig(
            temperature=0.1,
            max_output_tokens=1024
        )
        prompt_full = "Extract the full text from the provided image."
        # Context manager releases the image file handle after the request.
        with Image.open(image_path) as image:
            response_full = model.generate_content(
                [prompt_full, image], generation_config=generation_config
            )
        full_text = response_full.text
    except Exception as e:
        print(f"Error processing {image_path}: {e}")
        return "", None

    # Step 2: optional page number. Handled separately so that a failure here
    # does not throw away the text that was already extracted.
    page_number = None
    if page_location in ('top', 'bottom'):
        try:
            region = extract_page_region(image_path, location=page_location)
            if region is not None:
                prompt_page = "Identify the page number from this image region."
                response_page = model.generate_content(
                    [prompt_page, region], generation_config=generation_config
                )
                # First standalone run of digits in the answer is the page number.
                page_number_match = re.search(r'\b\d+\b', response_page.text)
                if page_number_match:
                    page_number = int(page_number_match.group())
        except Exception as e:
            print(f"Error extracting page number from {image_path}: {e}")

    return full_text, page_number

def process_images(image_paths, page_location='top'):
    """Extract text and page number for every image and collect the results.

    Args:
        image_paths: Iterable of image file paths.
        page_location: Forwarded to ``extract_text_and_page_number``.

    Returns:
        A list with one dict per image, holding the keys ``"Image Path"``,
        ``"Extracted Text"`` and ``"Page Number"``.
    """
    pages = []
    for path in image_paths:
        text, page_number = extract_text_and_page_number(path, page_location=page_location)

        # Debugging: print a preview of the extracted text and the page number
        print(f"Extracted Text for {path}: {text[:100]}...")  # First 100 characters
        print(f"Page Number: {page_number}")

        # Strip first, then fall back: whitespace-only output also counts as
        # "nothing extracted" instead of being stored as an empty string.
        text = str(text).strip() if text else ""
        if not text:
            text = "No text extracted"

        pages.append({
            "Image Path": path,
            "Extracted Text": text,
            "Page Number": page_number
        })
    return pages

def save_to_json(metadata_list, output_dir, name, author, type):
    """Write the per-page metadata plus document-level fields to a JSON file.

    The file is named ``<name>_<author>.json`` and is created inside
    ``output_dir`` (which is created if it does not exist yet).

    Args:
        metadata_list: Per-page entries stored under the ``"Pages"`` key.
        output_dir: Destination directory.
        name: Document name; also used in the file name.
        author: Document author; also used in the file name.
        type: Value stored under the ``"Type"`` key (the parameter name
            shadows the builtin but is part of the calling interface).
    """
    os.makedirs(output_dir, exist_ok=True)
    output_path = os.path.join(output_dir, f"{name}_{author}.json")

    # Document-level envelope around the page entries, stamped with the
    # current local time.
    timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    document_metadata = {}
    document_metadata["Author"] = author
    document_metadata["Name"] = name
    document_metadata["Date"] = timestamp
    document_metadata["Pages"] = metadata_list
    document_metadata["Type"] = type

    # ensure_ascii=False keeps accented characters readable in the file.
    serialized = json.dumps(document_metadata, indent=4, ensure_ascii=False)
    with open(output_path, 'w', encoding='utf-8') as json_file:
        json_file.write(serialized)
    print(f"JSON saved to: {output_path}")

def main(
    image_folder,
    output_dir,
    name,
    author,
    page_location='top',
    type=None
):
    """Run the pipeline: list images, extract their text, save one JSON file.

    Args:
        image_folder: Folder scanned for page images.
        output_dir: Directory that receives the JSON file.
        name: Document name stored in the JSON and used in the file name.
        author: Document author stored in the JSON and used in the file name.
        page_location: Region searched for a page number, passed through to
            ``process_images``.
        type: Value stored under the ``"Type"`` key of the JSON. When omitted,
            the module-level ``config['type']`` is used if it exists (this
            keeps older call sites working); otherwise ``None`` is stored.
    """
    # Resolve the document type up front instead of reading a global at save
    # time, after every image has already been sent to the API.
    if type is None:
        type = globals().get('config', {}).get('type')

    # Retrieve image paths
    image_paths = get_image_paths_from_folder(image_folder)
    if not image_paths:
        print(f"No images found in folder: {image_folder}")
        return

    # Process images and extract text with metadata
    metadata_list = process_images(image_paths, page_location)

    # Save metadata and text to a JSON file
    save_to_json(metadata_list, output_dir, name, author, type=type)

if __name__ == "__main__":
    # Run settings for this document. The dict must stay a module-level name
    # called `config`, because main() reads config['type'] from it.
    # NOTE(review): the recorded run of this cell ended with FileNotFoundError
    # for 'Photos\la belle' — confirm the folder relative to the working dir.
    # NOTE(review): output_dir is an absolute, machine-specific path — consider
    # a path relative to the project instead.
    config = dict(
        image_folder=r'Photos\la belle',
        output_dir=r'C:\Users\aurel\OneDrive\Documents\Python\IronHack\Project\Family safe',
        name="La belle au bois dormant",
        author="Perrault",
        page_location='bottom',  # Options: 'top', 'bottom', 'none'
        type='scan',
    )

    main(
        config['image_folder'],
        config['output_dir'],
        config['name'],
        config['author'],
        page_location=config['page_location'],
    )


FileNotFoundError: [WinError 3] Le chemin d’accès spécifié est introuvable: 'Photos\\la belle'