# **Libraries**


In [19]:
import requests
from pdf2image import convert_from_path
from PIL import Image
import os
import math

# PDF2Text | _Understanding the Formation of Galaxies with Warm Dark Matter_

In [20]:
# Download the paper's PDF from arXiv and store it locally for later cells.
url = "https://arxiv.org/pdf/2310.06882.pdf"
# A timeout keeps the cell from hanging forever on a stalled connection.
response = requests.get(url, timeout=60)
# Fail loudly on an HTTP error instead of saving an error page as a ".pdf".
response.raise_for_status()
# Make sure the target folder exists so the write works on a fresh checkout.
os.makedirs("../data/v3/pdf", exist_ok=True)
with open("../data/v3/pdf/paper.pdf", "wb") as f:
    f.write(response.content)

# PDF2Image


In [30]:
# Render every page of the PDF as an image at 500 DPI.
# NOTE(review): this reads "GA2023_FinalPaper.pdf", while the download cell in
# this notebook writes "paper.pdf" - confirm this file exists and is the
# intended input before relying on a fresh Restart & Run All.
pages = convert_from_path("../data/v3/pdf/GA2023_FinalPaper.pdf", dpi=500)

# Empty the output folder so pages from a previous run cannot linger.
os.makedirs("../data/v3/img", exist_ok=True)
for file in os.listdir("../data/v3/img"):
    stale_path = os.path.join("../data/v3/img", file)
    # os.remove() cannot delete real directories, so leave those alone
    # instead of crashing the cell.
    if os.path.isdir(stale_path) and not os.path.islink(stale_path):
        continue
    os.remove(stale_path)

# Save one JPEG per page, numbered from 1.
for index, page in enumerate(pages, start=1):
    page.save(f"../data/v3/img/paper_page_{index}.jpg", "JPEG")

**Consolidate Images Back Into One**


In [31]:
def extract_page_number(filename):
    """Parse the page number out of a page-image file name.

    The number is the text between the last underscore and the first dot
    that follows it, e.g. ``"paper_page_12.jpg"`` -> ``12``.

    Raises:
        ValueError: If that part of the name is not an integer.
    """
    _, _, tail = filename.rpartition("_")
    digits, _, _ = tail.partition(".")
    return int(digits)

In [32]:
def resize_image(img, target_width, target_height):
    """Scale ``img`` to exactly ``target_width`` x ``target_height``.

    The size is applied as given, so the aspect ratio changes whenever the
    target proportions differ from the source. Resampling uses the Lanczos
    filter.

    Returns:
        Whatever ``img.resize`` returns for the requested size.
    """
    new_size = (target_width, target_height)
    return img.resize(new_size, Image.LANCZOS)

In [33]:
def combine_and_save_images(images, output_path):
    """Paste images side by side on a white canvas and save the result.

    Args:
        images: Non-empty sequence of images, pasted left to right in the
            given order with their top edges aligned.
        output_path: Destination path handed to the canvas's ``save``.

    Raises:
        ValueError: If ``images`` is empty.
    """
    if len(images) == 0:
        raise ValueError("combine_and_save_images() needs at least one image")

    combined_width = sum(img.width for img in images)
    # Size the canvas for the tallest image, not just the first one, so a
    # taller image later in the list is not cropped at the bottom.
    combined_height = max(img.height for img in images)
    combined_image = Image.new(
        "RGB", (combined_width, combined_height), (255, 255, 255)
    )

    # Each image starts where the previous one ended.
    x_offset = 0
    for img in images:
        combined_image.paste(img, (x_offset, 0))
        x_offset += img.width

    # Save the combined image
    combined_image.save(output_path)
    print(f"Combined and resized images saved at {output_path}")

In [34]:
def combine_images(input_directory, output_path, output_size=(1000, 1000)):
    """Tile the page images of a directory into one grid image and save it.

    Files ending in ``.png`` or ``.jpg`` are ordered by the page number that
    ``extract_page_number`` parses from their names, then pasted row by row
    onto a white canvas whose grid is as close to square as possible. Every
    grid cell is as large as the widest and tallest page. The finished grid
    is resized to ``output_size`` (aspect ratio is not preserved) and saved.

    Args:
        input_directory: Folder containing the page images.
        output_path: Destination path for the combined image.
        output_size: ``(width, height)`` of the saved image in pixels.

    Raises:
        ValueError: If the directory holds no ``.png``/``.jpg`` file, or if
            one of those files has no parsable page number in its name.
    """
    # Filter by extension BEFORE sorting: the sort key parses a page number
    # from each name, so sorting first would crash on any unrelated file.
    page_files = [
        name
        for name in os.listdir(input_directory)
        if name.endswith((".png", ".jpg"))
    ]
    if not page_files:
        # Without this guard the grid maths below would divide by zero.
        raise ValueError(f"No .png or .jpg images found in {input_directory!r}")
    page_files.sort(key=extract_page_number)

    images = []
    try:
        # Load all page images in page order
        for name in page_files:
            images.append(Image.open(os.path.join(input_directory, name)))

        # Calculate the number of rows and columns for the grid
        num_images = len(images)
        num_cols = math.ceil(math.sqrt(num_images))
        num_rows = math.ceil(num_images / num_cols)

        # Calculate the total dimensions of the output image
        max_width = max(img.width for img in images)
        max_height = max(img.height for img in images)
        grid_width = max_width * num_cols
        grid_height = max_height * num_rows

        # Create a new blank image to paste the images onto
        combined_image = Image.new(
            "RGB", (grid_width, grid_height), (255, 255, 255)
        )

        # Paste images onto the combined image, filling each row left to right
        for i, img in enumerate(images):
            row_idx, col_idx = divmod(i, num_cols)
            combined_image.paste(img, (col_idx * max_width, row_idx * max_height))
    finally:
        # Release the source files whether or not the grid was built.
        for img in images:
            img.close()

    # Save the final combined image
    target_width, target_height = output_size
    combined_image = resize_image(combined_image, target_width, target_height)
    combined_image.save(output_path)
    print(f"Combined images saved at {output_path}")

In [35]:
# Build the single overview image from the per-page JPEGs written earlier.
# Create the destination folder first so saving works on a fresh checkout.
os.makedirs("../data/v3/consolidated", exist_ok=True)
combine_images("../data/v3/img", "../data/v3/consolidated/paper.png")

Combined images saved at ../data/v3/consolidated/paper.png
