In [None]:
# Install the notebook's dependencies with the %pip magic so they land in the
# environment of the running kernel (a bare `!pip` may hit a different one).
# NOTE(review): no cell visible here imports lxml (parsing uses xml.etree) —
# confirm it is still required.
%pip install matplotlib lxml



Parse and Visualize an InkML File

In [None]:
import xml.etree.ElementTree as ET
import matplotlib.pyplot as plt
import numpy as np
from PIL import Image

def parse_inkml(file_path):
    """Parse an InkML file to extract stroke and annotation information.

    Args:
        file_path: Path to the InkML (XML) file.

    Returns:
        A ``(traces, annotations)`` tuple. ``traces`` is a list of strokes,
        each a list of ``(x, y)`` float tuples built from the first two
        values of every point (any further values, e.g. a third channel, are
        ignored). Traces that contain no points are omitted. ``annotations``
        maps each top-level ``<annotation>``'s ``type`` attribute to its text.

    Raises:
        OSError / xml.etree.ElementTree.ParseError: if the file cannot be
            read or is not well-formed XML.
        IndexError / ValueError: if a point has fewer than two values or a
            value is not numeric.
    """
    namespace = '{http://www.w3.org/2003/InkML}'
    root = ET.parse(file_path).getroot()

    # Only direct children of the root are inspected; a repeated type keeps
    # the last value seen.
    annotations = {
        node.get('type'): node.text
        for node in root.findall(f'{namespace}annotation')
    }

    traces = []
    for trace in root.findall(f'{namespace}trace'):
        stroke = []
        # trace.text is None for a self-closing <trace/>.
        for point in (trace.text or '').split(','):
            coords = point.split()
            if not coords:
                # Blank chunk: empty trace or a trailing/doubled comma.
                continue
            stroke.append((float(coords[0]), float(coords[1])))
        if stroke:
            traces.append(stroke)

    return traces, annotations

def visualize_inkml(traces, img_size=(256, 256), save_path="output_image.png", show=True):
    """Render InkML strokes with Matplotlib and save them as a grayscale image.

    Args:
        traces: List of strokes; each stroke is a sequence of ``(x, y)`` points.
        img_size: Output size as ``(width, height)`` in pixels.
        save_path: File the image is written to.
        show: If True, also call ``plt.show()`` after the image is saved.

    Returns:
        None. The result is the image file written to ``save_path``.
    """
    width, height = img_size

    # Nothing to draw: write a plain white canvas. NumPy arrays are
    # (rows, cols) = (height, width), so the shape is the reverse of img_size.
    if not any(len(stroke) for stroke in traces):
        blank = np.full((height, width), 255, dtype=np.uint8)
        Image.fromarray(blank).save(save_path)
        return

    # 100 dpi with a size of pixels/100 inches gives a canvas of img_size pixels.
    fig, ax = plt.subplots(figsize=(width / 100, height / 100), dpi=100)
    try:
        for stroke in traces:
            stroke_x = [point[0] for point in stroke]
            stroke_y = [-point[1] for point in stroke]  # Negate y to correct orientation
            ax.plot(stroke_x, stroke_y, 'k-', linewidth=2)

        # Strip the axes and margins so the strokes fill the whole canvas.
        ax.axis('off')
        fig.subplots_adjust(left=0, right=1, top=1, bottom=0)

        # Rasterize, then view the RGBA buffer as a (height, width, 4) array.
        fig.canvas.draw()
        rgba = np.asarray(fig.canvas.buffer_rgba())

        # Convert to grayscale by averaging the RGB channels (alpha is ignored).
        gray_img = np.mean(rgba[..., :3], axis=2).astype(np.uint8)
        Image.fromarray(gray_img).save(save_path)

        if show:
            plt.show()
    finally:
        # Always release the figure, even when drawing or saving fails, so a
        # long batch run does not accumulate open figures.
        plt.close(fig)

def main(file_path):
    """Parse one InkML file, print its annotations, and render its strokes.

    Any exception raised along the way is caught and reported on stdout
    instead of propagating to the caller.
    """
    try:
        strokes, metadata = parse_inkml(file_path)

        # Report every annotation as "type: text", one per line.
        print("File Annotations:")
        for name in metadata:
            print(f"{name}: {metadata[name]}")

        # Render with the default size and output path.
        visualize_inkml(strokes)
    except Exception as err:
        print(f"Error processing InkML file: {err}")

# Demo: parse and render a single sample file.
# NOTE(review): folder_path is not defined in any cell visible here — it must
# already exist in the kernel; confirm where it is set.
if __name__ == "__main__":
    inkml_file_path = folder_path +"synthetic/000a192d6e72d170.inkml" # Sample InkML file; plain concatenation, so folder_path presumably ends with "/"
    main(inkml_file_path)

Error processing InkML file: [Errno 2] No such file or directory: '/content/drive/MyDrive/3312_images/synthetic_images/017054e68411efd6.inkml'


Preprocess the Whole Dataset into Images and Labels (LaTeX Formulas)

In [None]:
import os
from tqdm import tqdm
from PIL import Image
import numpy as np
import xml.etree.ElementTree as ET

def process_inkml_folder(inkml_folder, output_image_folder, output_label_file, max_samples=None, img_size=(256, 256)):
    """Render every InkML file in a folder to a PNG and record its label.

    Args:
        inkml_folder: Directory that is scanned (non-recursively) for
            ``*.inkml`` files.
        output_image_folder: Directory the PNGs are written to; created if
            missing.
        output_label_file: Text file that is overwritten with one
            ``<image name>\\t<label>`` line per successfully processed file.
        max_samples: If not None, only the first ``max_samples`` files (in
            sorted filename order) are processed.
        img_size: Image size forwarded to ``visualize_inkml``.

    Files that fail (parse error, missing 'label' annotation, render error)
    are reported on stdout and skipped; they do not stop the run.
    """
    os.makedirs(output_image_folder, exist_ok=True)

    # os.listdir returns entries in arbitrary order; sort so that max_samples
    # selects the same subset on every run.
    inkml_files = sorted(f for f in os.listdir(inkml_folder) if f.endswith(".inkml"))
    if max_samples is not None:
        inkml_files = inkml_files[:max_samples]

    with open(output_label_file, "w", encoding="utf-8") as label_file:
        # Iterate through all .inkml files in the folder with a progress bar
        for filename in tqdm(inkml_files, desc="Processing InkML files"):
            file_path = os.path.join(inkml_folder, filename)
            try:
                # Parse the InkML file
                traces, annotations = parse_inkml(file_path)

                # Look the label up before rendering, so a file without one
                # is skipped without leaving an unlabeled image behind.
                label = annotations['label']

                # Save the image
                image_name = os.path.splitext(filename)[0] + ".png"
                image_save_path = os.path.join(output_image_folder, image_name)
                visualize_inkml(traces, img_size=img_size, save_path=image_save_path, show=False)

                # Save the label (LaTeX annotation)
                label_file.write(f"{image_name}\t{label}\n")

            except Exception as e:
                print(f"Error processing file {filename}: {str(e)}")

    print(f"Finished processing {len(inkml_files)} files.")

# Convert the "synthetic" subset: PNGs go to ./images and labels to ./labels.txt
# (both relative to the current working directory).
# NOTE(review): folder_path is not defined in any cell visible here — confirm
# where it is set; it is concatenated directly, so it presumably ends with "/".
inkml_folder = folder_path+"synthetic"
output_image_folder = "images"
output_label_file = "labels.txt"

# Process at most 3000 files.
process_inkml_folder(inkml_folder, output_image_folder, output_label_file, max_samples=3000)

Processing InkML files: 100%|██████████| 100/100 [00:05<00:00, 18.26it/s]

Finished processing 100 files.





In [1]:
import os
from tqdm import tqdm
from PIL import Image
import numpy as np
import xml.etree.ElementTree as ET
from google.colab import drive

# Mount Google Drive at /content/drive (an existing mount is reused, not remounted)
drive.mount('/content/drive', force_remount=False)

# Folder on Google Drive that receives the preprocessed images and the label file
my_drive_path = "/content/drive/My Drive/Senior Year/Spring Semester/CSCI 5527/CSCI 5527 Project/Data/preprocessed"

def process_inkml_folder(inkml_folder, output_image_folder, output_label_file, max_samples=None, img_size=(256, 256)):
    """Render every InkML file in a folder to a PNG and record its label.

    Args:
        inkml_folder: Directory that is scanned (non-recursively) for
            ``*.inkml`` files.
        output_image_folder: Directory the PNGs are written to; created if
            missing.
        output_label_file: Text file that is overwritten with one
            ``<image name>\\t<label>`` line per successfully processed file.
        max_samples: If not None, only the first ``max_samples`` files (in
            sorted filename order) are processed.
        img_size: Image size forwarded to ``visualize_inkml``.

    Files that fail (parse error, missing 'label' annotation, render error)
    are reported on stdout and skipped; they do not stop the run.
    """
    os.makedirs(output_image_folder, exist_ok=True)
    # os.listdir returns entries in arbitrary order; sort so that max_samples
    # selects the same subset on every run.
    inkml_files = sorted(f for f in os.listdir(inkml_folder) if f.endswith(".inkml"))
    if max_samples is not None:
        inkml_files = inkml_files[:max_samples]
    with open(output_label_file, "w", encoding="utf-8") as label_file:
        # Iterate through all .inkml files in the folder with a progress bar
        for filename in tqdm(inkml_files, desc="Processing InkML files"):
            file_path = os.path.join(inkml_folder, filename)
            try:
                # Parse the InkML file
                traces, annotations = parse_inkml(file_path)
                # Look the label up before rendering, so a file without one
                # is skipped without leaving an unlabeled image behind.
                label = annotations['label']
                # Save the image
                image_name = os.path.splitext(filename)[0] + ".png"
                image_save_path = os.path.join(output_image_folder, image_name)
                visualize_inkml(traces, img_size=img_size, save_path=image_save_path, show=False)
                # Save the label (LaTeX annotation)
                label_file.write(f"{image_name}\t{label}\n")
            except Exception as e:
                print(f"Error processing file {filename}: {str(e)}")
    print(f"Finished processing {len(inkml_files)} files.")

# Input: the dataset's "train" folder on Google Drive.
# Outputs: an images/ folder and labels.txt, both under my_drive_path.
# NOTE(review): the recorded run of this cell ended with
# "OSError: [Errno 5] Input/output error" on the input folder below —
# presumably a Drive-mount problem when listing it; consider copying the
# dataset to local disk first. TODO confirm.
inkml_folder = "/content/drive/My Drive/Senior Year/Spring Semester/CSCI 5527/CSCI 5527 Project/Data/mathwriting-2024_full/mathwriting-2024/train"
output_image_folder = os.path.join(my_drive_path, "images")
output_label_file = os.path.join(my_drive_path, "labels.txt")

# Make sure the output root exists before anything is written into it
if not os.path.exists(my_drive_path):
    os.makedirs(my_drive_path)

# Convert at most 3000 files from the train folder
process_inkml_folder(inkml_folder, output_image_folder, output_label_file, max_samples=3000)

Mounted at /content/drive


OSError: [Errno 5] Input/output error: '/content/drive/My Drive/Senior Year/Spring Semester/CSCI 5527/CSCI 5527 Project/Data/mathwriting-2024_full/mathwriting-2024/train'