In [3]:
import datetime
import os
from xml.dom.minidom import parseString
from xml.etree.ElementTree import Element, SubElement, tostring

import pandas as pd
from lxml import etree

In [7]:
#render the csv into page xml, with the coords and text

def arrange_rectangle_coords_from_string_to_string(coord_str):
    """Normalise four "x,y" points into an axis-aligned rectangle string.

    Args:
        coord_str: Whitespace-separated string of exactly four "x,y" pairs
            with integer values, in any order.

    Returns:
        str: The corners of the points' bounding box, formatted as
        "x,y x,y x,y x,y" in the order top-left, top-right, bottom-right,
        bottom-left (top meaning the smallest y value).

    Raises:
        ValueError: If the string does not contain exactly four points, or a
            value cannot be parsed as an integer.
    """
    points = []
    for pair in coord_str.split():
        points.append(tuple(int(value) for value in pair.split(',')))

    if len(points) != 4:
        raise ValueError("Four coordinates are required to form a rectangle.")

    # Only the extremes matter: the rectangle is the bounding box of the points.
    xs = [point[0] for point in points]
    ys = [point[1] for point in points]
    left, right = min(xs), max(xs)
    top, bottom = min(ys), max(ys)

    corners = [(left, top), (right, top), (right, bottom), (left, bottom)]
    return ' '.join(f"{x},{y}" for x, y in corners)

def create_page_xml(row):
    """Build a pretty-printed PAGE XML document for one metadata record.

    Args:
        row: Mapping-like record (for example a pandas Series yielded by
            ``DataFrame.iterrows()``) providing ``file_name`` and, for each of
            ``ocr_writen_on``, ``ocr_writen_by``, ``ocr_main_content`` and
            ``ocr_additional_content``, a text column plus a matching
            ``*_coords`` column holding four "x,y" points. Missing (NaN)
            coords cells are skipped; missing text becomes an empty string.

    Returns:
        str: The XML document as rendered by ``minidom``'s ``toprettyxml``.

    Raises:
        ValueError: Propagated from
            ``arrange_rectangle_coords_from_string_to_string`` when a coords
            cell does not contain exactly four integer points.
    """
    pcgts = Element('PcGts', {
        'xmlns': 'http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15',
        'xmlns:xsi': 'http://www.w3.org/2001/XMLSchema-instance',
        'xsi:schemaLocation': 'http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15 http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15/pagecontent.xsd'
    })
    metadata = SubElement(pcgts, 'Metadata')
    SubElement(metadata, 'Creator').text = 'escriptorium'
    # NOTE(review): both timestamps are fixed literals, so every generated
    # file carries the same creation / modification time - confirm intended.
    SubElement(metadata, 'Created').text = '2024-02-18T21:02:47.785694+00:00'
    SubElement(metadata, 'LastChange').text = '2024-02-20T08:09:51.332236+00:00'

    # str() guards against a non-string cell (e.g. a purely numeric column),
    # which ElementTree would refuse to serialise.
    page = SubElement(pcgts, 'Page', {
        'imageFilename': str(row['file_name'])
    })

    # Map each normalised rectangle to its text. Fields that share the exact
    # same rectangle are merged into one region, texts joined by a space.
    regions = {}

    for region_type in ['ocr_writen_on_coords', 'ocr_writen_by_coords', 'ocr_main_content_coords', 'ocr_additional_content_coords']:
        if pd.isna(row[region_type]):
            continue

        coords_str = arrange_rectangle_coords_from_string_to_string(row[region_type])
        text_column_name = region_type.replace('_coords', '')
        cell = row[text_column_name]
        # Coerce to str so numeric cells do not break concatenation/strip().
        text_content = str(cell) if pd.notna(cell) else ''

        if coords_str in regions:
            regions[coords_str] += ' ' + text_content
        else:
            regions[coords_str] = text_content

    for coords_str, text_content in regions.items():
        # The coordinates double as a unique identifier suffix.
        id_suffix = coords_str.replace(",", "_").replace(" ", "_")

        text_region = SubElement(page, 'TextRegion', {'id': f'eSc_region_{id_suffix}', 'custom': 'structure {type:text;}'})
        SubElement(text_region, 'Coords', {'points': coords_str})

        # Add a default text line with the same coordinates as the region
        text_line = SubElement(text_region, 'TextLine', {'id': f'eSc_line_{id_suffix}', 'custom': 'structure {type:default;}'})
        SubElement(text_line, 'Coords', {'points': coords_str})

        # The baseline is the bottom edge of the rectangle, written from the
        # bottom-left corner to the bottom-right corner. coords_str is ordered
        # top-left, top-right, bottom-right, bottom-left.
        _, _, bottom_right, bottom_left = coords_str.split()
        SubElement(text_line, 'Baseline', {'points': f"{bottom_left} {bottom_right}"})

        text_equiv = SubElement(text_line, 'TextEquiv')
        SubElement(text_equiv, 'Unicode').text = text_content.strip()

    # Serialise, then re-parse with minidom to obtain indented output.
    return parseString(tostring(pcgts)).toprettyxml()

def process_csv_to_xml(csv_file, output_dir='XMLS'):
    """Write one PAGE XML file per row of a metadata CSV.

    Args:
        csv_file: Path to a UTF-8 encoded CSV readable by ``pandas.read_csv``.
        output_dir: Directory that receives the XML files. It is created if
            it does not exist. Defaults to ``'XMLS'``.

    Returns:
        pandas.DataFrame: The frame loaded from ``csv_file``.
    """
    df = pd.read_csv(csv_file, encoding='utf8')

    # Without this, the first open() fails when the directory is missing.
    os.makedirs(output_dir, exist_ok=True)

    for _, row in df.iterrows():
        xml_str = create_page_xml(row)
        # NOTE(review): files are named after `file_name`; if that column is
        # not unique across rows, later rows overwrite earlier ones - confirm.
        xml_path = os.path.join(output_dir, f'{row["file_name"]}.xml')
        with open(xml_path, 'w', encoding='utf8') as f:
            f.write(xml_str)
    return df
            
# Example usage: write one PAGE XML file per CSV row and keep the loaded
# DataFrame in `df` for the cells below.
csv_file = 'gnazim_db_meta_data_2k_fixed_coords_utf8.csv'
df = process_csv_to_xml(csv_file)


In [8]:
# Preview the first rows of the metadata frame.
df.head()

Unnamed: 0,identifier,path,gcp_file_id,folder_name,author_subject,type,Years,gcp_folder_id,file_name,ocr_writen_on,...,ocr_additional_content_coords,paragraphs_detection_successes,ocr_all_text_preprocess,ocr_all_text_no_preprocess,ocr_main_content_all_text_preprocess,ocr_main_content_all_text_no_preprocess,years,gcp_image_link,gcp_folder_link,is_handwritten
0,IDGNAZIM0001,"\CD00085 גורי, חיים - גורי, חיים\2266.tif",1ASOIID456UbRActNY4dcTHyuq0vcow-z,"\CD00085 גורי, חיים - גורי, חיים","גורי, חיים",,,1c6Bz4okJqtRXoTqvR4BXBxN3h1o71KDT,2266.tif,]ערה,...,"485,696 550,696 485,812 550,812",True,"סי | כב // / ָ 9 ה [ערה, ס.ל, /",,"ָ 9 ה [ערה, ס.ל, /",,,https://drive.google.com/file/d/1ASOIID456UbRA...,https://drive.google.com/drive/folders/1c6Bz4o...,1
1,IDGNAZIM0002,"\CD00085 גורי, חיים - גורי, חיים\2277.tif",10GB_ugUJSVRfVZFXvLpiOCH9dK6l2-1S,"\CD00085 גורי, חיים - גורי, חיים","גורי, חיים",,,1c6Bz4okJqtRXoTqvR4BXBxN3h1o71KDT,2277.tif,"גורי,חיים, עוזר יוסף",...,,True,"גורי,חיים, עוזר יוסף הנה מוסלות טליתותינו יוסף...","גורי,תיים 0 , עוזר יוסף הנח מוטלות טליתותינו י...","ו יוסף עוזר משורר סחזר בתצטובר, בטיחוהת מפגלצ ...",תינו יוסף עוזר משורר שחזר בתשובה בשיחות מפגש ע...,,https://drive.google.com/file/d/10GB_ugUJSVRfV...,https://drive.google.com/drive/folders/1c6Bz4o...,0
2,IDGNAZIM0003,"\CD00085 גורי, חיים - גורי, חיים\2253.tif",1b-fyZK6WC0_FRQSWw2DmHnQQB4L0aHZ3,"\CD00085 גורי, חיים - גורי, חיים","גורי, חיים",,,1c6Bz4okJqtRXoTqvR4BXBxN3h1o71KDT,2253.tif,-,...,"298,17 1204,17 298,162 1204,162",True,"%]ר, הו כ 3 א | ₪2 0 ת)שת | 4 א ג = תאו [ - , ...","%!ר, / 27 ז 3 ₪9 2 7 ץ . גי \ תאו [ -. 5 וע",,,,https://drive.google.com/file/d/1b-fyZK6WC0_FR...,https://drive.google.com/drive/folders/1c6Bz4o...,1
3,IDGNAZIM0004,"\CD00085 גורי, חיים - גורי, חיים\2264.tif",1WLWQ1PZ0zDlKsy6ULhPqmos_lo0qFoGK,"\CD00085 גורי, חיים - גורי, חיים","גורי, חיים",,,1c6Bz4okJqtRXoTqvR4BXBxN3h1o71KDT,2264.tif,ים,...,"571,285 743,285 571,343 743,343",True,"גורי, חיים תקוות ""אנסי התושך"" שחלפה לפרתחב, 20...","תקופת ""אנטי התוטך"" שחלפה למרחתחב, 80.4.1956, ע...","אנסי התושך"" שחלפה לפרתחב, 20.4.1956, עס' 2","שחלפה למרחתחב, 80.4.1956, עמ' 8",,https://drive.google.com/file/d/1WLWQ1PZ0zDlKs...,https://drive.google.com/drive/folders/1c6Bz4o...,0
4,IDGNAZIM0005,"\CD00085 גורי, חיים - גורי, חיים\2272.tif",1mukseIvdGHRIHT0sa0Kl1y6eL1EVBYc3,"\CD00085 גורי, חיים - גורי, חיים","גורי, חיים",,,1c6Bz4okJqtRXoTqvR4BXBxN3h1o71KDT,2272.tif,"גורי, חיים",...,"131,527 778,527 131,614 778,614",True,"גורי, חיים תס""תח - תסט""ז 5 7 (פה אומרים?) חתום...","גורי, חיים תטש""ח - תשט""צך (מה אומרים?) חתום ג ...","ום הג ) / לפרחב, 16.4.1956, עם' 2","ג למרהב, 16.4,1956, עמ' 2",,https://drive.google.com/file/d/1mukseIvdGHRIH...,https://drive.google.com/drive/folders/1c6Bz4o...,0


In [1]:
#initialize the drive
from pydrive2.auth import GoogleAuth
from pydrive2.drive import GoogleDrive

def establish_connection() -> GoogleDrive:
    """Authenticate against Google Drive and return a client.

    Runs PyDrive2's local-webserver authentication flow and wraps the
    resulting credentials in a ``GoogleDrive`` object.

    Returns:
        A ``GoogleDrive`` instance built from the authenticated ``GoogleAuth``.
    """
    authenticator = GoogleAuth()
    # Opens a browser window so the user can complete the sign-in.
    authenticator.LocalWebserverAuth()
    drive_client = GoogleDrive(authenticator)
    return drive_client

def test_establish_connection():
    """Try to authenticate and report the outcome on stdout.

    Returns:
        The object returned by ``establish_connection()`` on success, or
        ``None`` if any exception was raised along the way.
    """
    try:
        connected_drive = establish_connection()
        print("Authentication successful!")
    except Exception as auth_error:
        print(f"Authentication failed: {auth_error}")
        return None
    return connected_drive

def test_connection(drive: GoogleDrive):
    """Check a Drive connection by printing the titles of up to 5 root files.

    Args:
        drive: An authenticated GoogleDrive instance, or ``None`` when
            authentication failed (in which case only a message is printed).
    """
    if drive is None:
        print("Cannot test connection because authentication failed.")
        return

    # Non-trashed items directly under the root folder, capped at five.
    listing_query = {'q': "'root' in parents and trashed=false", 'maxResults': 5}
    try:
        listed_files = drive.ListFile(listing_query).GetList()
        print("Connection successful! Here are the first 5 files in your Google Drive:")
        for entry in listed_files:
            print(f'- {entry["title"]}')
    except Exception as listing_error:
        print(f"An error occurred while listing files: {listing_error}")

if __name__ == "__main__":
    # Authenticate; `drive` is None if authentication failed.
    # The resulting `drive` is reused by the download cells further down.
    drive = test_establish_connection()

    # List a few root files to confirm the connection works.
    test_connection(drive)


Your browser has been opened to visit:

    https://accounts.google.com/o/oauth2/auth?client_id=1007301603530-of60vh7oh54n3ruao261tms36kehruan.apps.googleusercontent.com&redirect_uri=http%3A%2F%2Flocalhost%3A8080%2F&scope=https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive&access_type=online&response_type=code

Authentication successful.
Authentication successful!
Connection successful! Here are the first 5 files in your Google Drive:
- All Magazines All Object-Verb Sub-Corpus 2024-01-30.csv
- Chapter 3, datafication and digital philology.pptx
- תמונה הלל.jpeg
- All Magazines Object-Verb Sub-Corpus 2024-01-09.csv
- All Magazines Object-Verb Sub-Corpus 2023-12-25.csv


In [None]:
# This works once the Drive connection above has been initialised.
# Make sure nothing else is listening on localhost:8080, the address the
# OAuth flow redirects to.
file_id = '1cGPC3OCLB58l9jYut--yhmgo8VMzpG2e'

# Path where the image will be saved
save_path = 'downloaded_image.tif'

def download_image(drive: GoogleDrive, file_id: str, save_path: str):
    """Fetch one file from Google Drive and store it locally.

    Args:
        drive: An authenticated GoogleDrive instance.
        file_id: The Drive ID of the file to download.
        save_path: Local path the file content is written to.
    """
    # Create a handle for the existing file by ID, then pull its content.
    remote_file = drive.CreateFile({'id': file_id})
    remote_file.GetContentFile(save_path)
    print(f"Image downloaded and saved as {save_path}")

# Download the single image configured above (uses the `drive`, `file_id`
# and `save_path` names currently bound in the kernel).
download_image(drive, file_id, save_path)

In [17]:
# Show the Drive link of whichever `row` is currently bound in the kernel.
# NOTE(review): `row` is not defined in this cell, so this relies on state
# left behind by another cell - confirm which one before re-running.
row['gcp_image_link']

'https://drive.google.com/file/d/1cGPC3OCLB58l9jYut--yhmgo8VMzpG2e'

In [None]:
# Download every image listed in `df`, one Drive request per row.
# NOTE(review): images are saved under `identifier`, while the PAGE XML
# written earlier references `file_name` - confirm the two are meant to differ.
# NOTE(review): presumably the Images/ directory must already exist - verify.
for index, row in df.iterrows():
    file_id = row['gcp_file_id']
    destination = f"Images/{row['identifier']}.tif"  # Adjust the path as needed
    download_image(drive, file_id, destination)

In [117]:
#display the boxes of an image according to the coordinates 
import pandas as pd
from PIL import Image, ImageDraw

# File name of the image to inspect; also used below as the path to open,
# so the file is expected in the current working directory.
image_filename = "2272.tif"

# Load the CSV file into a DataFrame with the specified encoding
csv_file = 'gnazim_db_meta_data_2k_fixed_coords_utf8.csv'
df = pd.read_csv(csv_file, encoding="utf8")

# Filter the DataFrame for the row(s) with the specified image filename.
# Note that `row` is a DataFrame here, not a single Series.
row = df[df['file_name'] == image_filename]

# Extract the raw coordinate cell of the first matching row for each region.
# `.values[0]` raises IndexError if no row matches `image_filename`.
ocr_writen_on_coords = row['ocr_writen_on_coords'].values[0]
ocr_writen_by_coords = row['ocr_writen_by_coords'].values[0]
ocr_main_content_coords = row['ocr_main_content_coords'].values[0]
ocr_additional_content_coords = row['ocr_additional_content_coords'].values[0]
# Convert the coordinate strings to lists of tuples
def convert_to_tuples(coord_str):
    """Parse a "x,y x,y ..." string into a list of integer (x, y) tuples.

    Args:
        coord_str: Whitespace-separated "x,y" pairs, or a missing value
            (anything ``pd.isna`` reports as missing).

    Returns:
        list: One ``(x, y)`` tuple of ints per pair, in input order; an empty
        list when ``coord_str`` is missing.
    """
    if pd.isna(coord_str):
        return []

    points = []
    for pair in coord_str.split():
        parts = pair.split(',')
        points.append((int(parts[0]), int(parts[1])))
    return points

# Replace each raw coordinate cell with its parsed list of (x, y) tuples.
# The names are rebound in place, so re-running this cell alone would pass
# lists (not strings) to convert_to_tuples.
ocr_writen_on_coords = convert_to_tuples(ocr_writen_on_coords)
ocr_writen_by_coords = convert_to_tuples(ocr_writen_by_coords)
ocr_main_content_coords = convert_to_tuples(ocr_main_content_coords)
ocr_additional_content_coords = convert_to_tuples(ocr_additional_content_coords)

# Load the image; convert to RGB so the coloured outlines can be drawn
image = Image.open(image_filename)
if image.mode != 'RGB':
    image = image.convert('RGB')

# Create a drawing context bound to this particular image object
draw = ImageDraw.Draw(image)

def draw_rectangle_from_coords(coords, color, draw_ctx=None):
    """Outline the bounding box of ``coords`` on an image.

    Args:
        coords: List of ``(x, y)`` tuples describing a region's corners, in
            any order. An empty list draws nothing.
        color: Outline colour passed through to ``rectangle``.
        draw_ctx: Object exposing a ``rectangle`` method (for example
            ``ImageDraw.Draw(image)``). When omitted, the notebook-level
            ``draw`` context is used.
    """
    if not coords:
        return

    context = draw if draw_ctx is None else draw_ctx

    # Use the bounding box instead of the first/last point: the first and
    # last points are opposite corners only for one specific corner ordering,
    # and would otherwise yield a degenerate or invalid rectangle.
    xs = [point[0] for point in coords]
    ys = [point[1] for point in coords]
    top_left = (min(xs), min(ys))
    bottom_right = (max(xs), max(ys))

    context.rectangle([top_left, bottom_right], outline=color, width=3)
    print('tried to draw...')

# Outline each OCR region in its own colour; regions with no coordinates
# (empty list) are skipped by draw_rectangle_from_coords.
draw_rectangle_from_coords(ocr_writen_on_coords, 'red')
draw_rectangle_from_coords(ocr_writen_by_coords, 'green')
draw_rectangle_from_coords(ocr_main_content_coords, 'blue')
draw_rectangle_from_coords(ocr_additional_content_coords, 'yellow')

# Show the annotated image
image.show()


tried to draw...
tried to draw...
tried to draw...
tried to draw...


In [27]:
# Sanity check: reload the image and draw a fixed green test rectangle on it.
image = Image.open(image_filename)
image = image.convert('RGB')
# Bind a fresh drawing context to the image just loaded. A previously created
# `draw` targets the earlier image object, so drawing through it would leave
# the image shown below unchanged.
draw = ImageDraw.Draw(image)
draw.rectangle([50, 50, 200, 200], outline='green', width=3)
image.show()


In [118]:
# Close the currently bound image object.
image.close()

In [122]:
# Display the metadata frame (the notebook truncates long frames to head/tail).
df

Unnamed: 0,identifier,path,gcp_file_id,folder_name,author_subject,type,Years,gcp_folder_id,file_name,ocr_writen_on,...,ocr_additional_content_coords,paragraphs_detection_successes,ocr_all_text_preprocess,ocr_all_text_no_preprocess,ocr_main_content_all_text_preprocess,ocr_main_content_all_text_no_preprocess,years,gcp_image_link,gcp_folder_link,is_handwritten
0,IDGNAZIM0001,"\CD00085 גורי, חיים - גורי, חיים\2266.tif",1ASOIID456UbRActNY4dcTHyuq0vcow-z,"\CD00085 גורי, חיים - גורי, חיים","גורי, חיים",,,1c6Bz4okJqtRXoTqvR4BXBxN3h1o71KDT,2266.tif,]ערה,...,"485,696 550,696 485,812 550,812",True,"סי | כב // / ָ 9 ה [ערה, ס.ל, /",,"ָ 9 ה [ערה, ס.ל, /",,,https://drive.google.com/file/d/1ASOIID456UbRA...,https://drive.google.com/drive/folders/1c6Bz4o...,1
1,IDGNAZIM0002,"\CD00085 גורי, חיים - גורי, חיים\2277.tif",10GB_ugUJSVRfVZFXvLpiOCH9dK6l2-1S,"\CD00085 גורי, חיים - גורי, חיים","גורי, חיים",,,1c6Bz4okJqtRXoTqvR4BXBxN3h1o71KDT,2277.tif,"גורי,חיים, עוזר יוסף",...,,True,"גורי,חיים, עוזר יוסף הנה מוסלות טליתותינו יוסף...","גורי,תיים 0 , עוזר יוסף הנח מוטלות טליתותינו י...","ו יוסף עוזר משורר סחזר בתצטובר, בטיחוהת מפגלצ ...",תינו יוסף עוזר משורר שחזר בתשובה בשיחות מפגש ע...,,https://drive.google.com/file/d/10GB_ugUJSVRfV...,https://drive.google.com/drive/folders/1c6Bz4o...,0
2,IDGNAZIM0003,"\CD00085 גורי, חיים - גורי, חיים\2253.tif",1b-fyZK6WC0_FRQSWw2DmHnQQB4L0aHZ3,"\CD00085 גורי, חיים - גורי, חיים","גורי, חיים",,,1c6Bz4okJqtRXoTqvR4BXBxN3h1o71KDT,2253.tif,-,...,"298,17 1204,17 298,162 1204,162",True,"%]ר, הו כ 3 א | ₪2 0 ת)שת | 4 א ג = תאו [ - , ...","%!ר, / 27 ז 3 ₪9 2 7 ץ . גי \ תאו [ -. 5 וע",,,,https://drive.google.com/file/d/1b-fyZK6WC0_FR...,https://drive.google.com/drive/folders/1c6Bz4o...,1
3,IDGNAZIM0004,"\CD00085 גורי, חיים - גורי, חיים\2264.tif",1WLWQ1PZ0zDlKsy6ULhPqmos_lo0qFoGK,"\CD00085 גורי, חיים - גורי, חיים","גורי, חיים",,,1c6Bz4okJqtRXoTqvR4BXBxN3h1o71KDT,2264.tif,ים,...,"571,285 743,285 571,343 743,343",True,"גורי, חיים תקוות ""אנסי התושך"" שחלפה לפרתחב, 20...","תקופת ""אנטי התוטך"" שחלפה למרחתחב, 80.4.1956, ע...","אנסי התושך"" שחלפה לפרתחב, 20.4.1956, עס' 2","שחלפה למרחתחב, 80.4.1956, עמ' 8",,https://drive.google.com/file/d/1WLWQ1PZ0zDlKs...,https://drive.google.com/drive/folders/1c6Bz4o...,0
4,IDGNAZIM0005,"\CD00085 גורי, חיים - גורי, חיים\2272.tif",1mukseIvdGHRIHT0sa0Kl1y6eL1EVBYc3,"\CD00085 גורי, חיים - גורי, חיים","גורי, חיים",,,1c6Bz4okJqtRXoTqvR4BXBxN3h1o71KDT,2272.tif,"גורי, חיים",...,"131,527 778,527 131,614 778,614",True,"גורי, חיים תס""תח - תסט""ז 5 7 (פה אומרים?) חתום...","גורי, חיים תטש""ח - תשט""צך (מה אומרים?) חתום ג ...","ום הג ) / לפרחב, 16.4.1956, עם' 2","ג למרהב, 16.4,1956, עמ' 2",,https://drive.google.com/file/d/1mukseIvdGHRIH...,https://drive.google.com/drive/folders/1c6Bz4o...,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2273,IDGNAZIM0002274,"\CD00085 גורי, חיים - גורי, חיים\0004.tif",1JTYY7KeITKvyuRI-rcIErLE3tX1awH6p,"\CD00085 גורי, חיים - גורי, חיים","גורי, חיים",,,1c6Bz4okJqtRXoTqvR4BXBxN3h1o71KDT,0004.tif,"ל, 2",...,,True,"ל, 2 ב13 ג'ג הר 5% חוכ 53 - 91% | / 57 כ כ רצי...",0 |( 9 5+ ת'ג כ 90| -912| / 25%כי- ו : -0 ה5כ ...,/ 57 כ כ רציי -07 5 ןו ] 7 ת!ג 64 2,"ו : -0 ה5כ ו ( 7% 7 תג 4 ,8 , 1. <2",,https://drive.google.com/file/d/1JTYY7KeITKvyu...,https://drive.google.com/drive/folders/1c6Bz4o...,1
2274,IDGNAZIM0002275,"\CD00085 גורי, חיים - גורי, חיים\0009.tif",1f4jiF2X6maCKKVf6QrhIWOycfxVZk8aB,"\CD00085 גורי, חיים - גורי, חיים","גורי, חיים",,,1c6Bz4okJqtRXoTqvR4BXBxN3h1o71KDT,0009.tif,גורי.חייס,...,"205,479 502,479 205,536 502,536",True,"גורי.חייס (אבינו סבטמים יתקדס סמר,-) , (סיץ מת...","גורי.חיים (אבינו שבטמים יתקדס שמך,-)| / (טיץ מ...","ים יתקדס סמר,-) , (סיץ מתור התילת הכופרים"" סגנ...","ים יתקדס שמך,-)| / (טיץ מתוך""תתפילת הכופרים"" ס...",,https://drive.google.com/file/d/1f4jiF2X6maCKK...,https://drive.google.com/drive/folders/1c6Bz4o...,0
2275,IDGNAZIM0002276,"\CD00085 גורי, חיים - גורי, חיים\0005.tif",1pqUFNIObU8iUAOIG7QeLs90HrFucSZhk,"\CD00085 גורי, חיים - גורי, חיים","גורי, חיים",,,1c6Bz4okJqtRXoTqvR4BXBxN3h1o71KDT,0005.tif,גורי חתיים,...,"458,582 1157,582 458,657 1157,657",True,"י, תיים אביב 1975 (מדור רטימות סוף סבוץ) | חתו...","גוהי, חייםי | . |אביב 1975 (מדור רשימות סוף שב...","ר דבר, סז ניפן תסל""ה,(28.0.75),?מ 2","| חתום צור) דבר, טז ניפן תשל""ה,(28.3.75),עמ 2",,https://drive.google.com/file/d/1pqUFNIObU8iUA...,https://drive.google.com/drive/folders/1c6Bz4o...,0
2276,IDGNAZIM0002277,"\CD00085 גורי, חיים - גורי, חיים\0003.tif",1JCCh65Ed6RQGH62XrNVaziOW0Isl24Tp,"\CD00085 גורי, חיים - גורי, חיים","גורי, חיים",,,1c6Bz4okJqtRXoTqvR4BXBxN3h1o71KDT,0003.tif,"גורי,חיים",...,"928,133 1168,133 928,206 1168,206",True,"גורי,חיים לכרחב, 199.₪.10, אבטירים [רבינה| [ ה...","גורי, חיים אבטיחים [רשימה] [חתפום:ג'ורי| למרחב...","9.₪.10, אבטירים [רבינה| [ הת וכ:ג","[רשימה] [חתפום:ג'ורי| למרחב,18.8,1955,פמ. 4",,https://drive.google.com/file/d/1JCCh65Ed6RQGH...,https://drive.google.com/drive/folders/1c6Bz4o...,0
