# Extract brand compliance information using PyMuPDF 

In [15]:
import fitz  # PyMuPDF

def extract_compliance_elements(pdf_path):
    """Scan every page of a PDF for keywords tied to four brand-compliance elements.

    The page text is lower-cased before matching, so all checks are
    case-insensitive.

    Args:
        pdf_path: Path of the PDF, passed straight to ``fitz.open``.

    Returns:
        dict with the keys ``"font_style"``, ``"logo_safe_zone"``,
        ``"logo_colours"`` and ``"colour_palette"``. Each value is ``None`` when
        no page matched, otherwise ``"Found on page N"`` where N is the 1-based
        number of the *last* page that matched (later pages overwrite earlier ones).
    """
    import re  # local import: only needed for the whole-word "inter" check

    results = {
        "font_style": None,
        "logo_safe_zone": None,
        "logo_colours": None,
        "colour_palette": None
    }

    doc = fitz.open(pdf_path)
    try:
        for page_num, page in enumerate(doc, start=1):
            text = page.get_text().lower()
            found_on = f"Found on page {page_num}"

            # "inter" is matched as a whole word; a plain substring test would
            # also fire on words such as "printer" or "interface".
            if "typography" in text or "lexend" in text or re.search(r"\binter\b", text):
                results["font_style"] = found_on

            if "safe zone" in text and "30px" in text:
                results["logo_safe_zone"] = found_on

            # Flagged when the page mentions both "do not" and "logo".
            if "do not" in text and "logo" in text:
                results["logo_colours"] = found_on

            if "#85a0fe" in text or "primary brand colors" in text or "lavender" in text:
                results["colour_palette"] = found_on
    finally:
        # Release the document even if text extraction fails part-way through.
        doc.close()

    return results

# Example usage: scan the brand kit and print where each element was detected.
pdf_path = "Neurons_brand_kit.pdf"
compliance = extract_compliance_elements(pdf_path)

print("Compliance Information Summary:")
for key in compliance:
    value = compliance[key]
    # Turn e.g. "logo_safe_zone" into "Logo Safe Zone"; None is shown as "Not found".
    print(f"{key.replace('_', ' ').title()}: {value or 'Not found'}")


Compliance Information Summary:
Font Style: Found on page 9
Logo Safe Zone: Found on page 6
Logo Colours: Found on page 6
Colour Palette: Found on page 8


In [16]:
import fitz  # PyMuPDF

# Dump the raw text of every page so the structure of the brand kit can be inspected.
doc = fitz.open("Neurons_brand_kit.pdf")

# enumerate() yields 0-based indices; the printed page header is 1-based.
for page_num, page in enumerate(doc):
    text = page.get_text()
    print(f"--- Page {page_num + 1} ---")
    print(text)

doc.close()


--- Page 1 ---
Brand Guidelines

--- Page 2 ---
Enabling marketers to 
make better descisions

--- Page 3 ---
04logo
05Logo misuse
06logo Safe zone
07colors
08extended colors
09Typography
Table of contents

--- Page 4 ---
Brand guidelines
Logo
2024
About The Logo
We are extremely proud of our Logo, and we would appreciate if you could follow 
these guidlines whenever you use it, to make sure it always shows its best.

As Neurons has evolved to focus much more on customer prediction rather than 
customer insights, our brand has also evolved.
Our Logo embodies much of Neurons’ bold spirit and innovative nature. The Logo is 
the combination of a simple and modern Typeface (Neurons) with the Icon. 
The Icon captures different meanings core to our brand  – simplicity, direction and 
iteration.

--- Page 5 ---
neurons
neurons
neurons
Do not crop the Logo
Do not disort the Logo
Do not use drop 
shadows or any 
other effects
Do not use any 
other font for the 
Typeface
Do not outline the 
Icon

# Brand Compliance Information:

Four elements are assessed for an overall grade in the range [0, 4]; each satisfied requirement contributes one point:

- Font style.
- Logo safe zone.
- Logo colours.
- Colour palette (overall image).

Reference for inferring font-style metadata (font name, size, etc.) from PDF text: https://stackoverflow.com/questions/19386711/extract-text-from-pdf-in-respect-to-formatting-font-size-type-etc

In [17]:
import fitz  # PyMuPDF

def extract_font_styles(pdf_path):
    """Read the primary and secondary font names from a PDF's text.

    A line whose stripped, lower-cased text is exactly ``"primary"`` or
    ``"secondary"`` is treated as a label; the line that follows it (stripped)
    is taken as the font name. A label on the very last line of a page has no
    following line and is ignored. If a label occurs more than once, the last
    occurrence wins.

    Args:
        pdf_path: Path of the PDF, passed straight to ``fitz.open``.

    Returns:
        dict with zero, one or both of the keys ``"Primary"`` and ``"Secondary"``.
    """
    labels = {"primary": "Primary", "secondary": "Secondary"}
    font_styles = {}

    doc = fitz.open(pdf_path)
    try:
        for page in doc:
            lines = page.get_text().splitlines()

            # Pair each line with its successor; the final line has none.
            for label_line, next_line in zip(lines, lines[1:]):
                key = labels.get(label_line.strip().lower())
                if key is not None:
                    font_styles[key] = next_line.strip()
    finally:
        # Release the document even if text extraction fails part-way through.
        doc.close()

    return font_styles

# Example usage: pull the primary/secondary font names out of the brand kit
# and show the resulting dict.
pdf_path = "Neurons_brand_kit.pdf"
fonts = extract_font_styles(pdf_path)

print(fonts)


{'Primary': 'Lexend', 'Secondary': 'Inter'}


In [18]:
def extract_logo_safezone_styles(pdf_path):
    """Extract the logo safe-zone measurement and requirement text from a PDF.

    Only the first page whose text contains "the safe zone" (case-insensitive)
    is used. On that page, lines are collected starting at the first line that
    mentions the phrase and stopping at the first later line that is blank or
    is exactly "yes"/"no" (case-insensitive). The collected lines are joined
    with spaces into one paragraph.

    Args:
        pdf_path: Path of the PDF, passed straight to ``fitz.open``.

    Returns:
        dict that is empty when no page mentions the safe zone, otherwise:
          * ``"Value"`` - the first "."-separated sentence containing "x is"
            (e.g. ``"X is 30px"``); omitted when no such sentence exists.
          * ``"Requirements"`` - the paragraph with the value sentence removed.
            Only the sentence text is removed, so the periods around it stay
            behind (e.g. ``". ."``).
    """
    logo_safezone = {}

    doc = fitz.open(pdf_path)
    try:
        for page in doc:
            text = page.get_text()
            if "the safe zone" not in text.lower():
                continue

            buffer = []
            found_section = False
            for line in text.splitlines():
                stripped = line.strip()
                if "the safe zone" in line.lower():
                    found_section = True
                    buffer.append(stripped)
                    continue
                if found_section:
                    # Stop at a new section (heuristic: a blank line or a "yes"/"no" label).
                    if stripped == "" or stripped.lower() in ["yes", "no"]:
                        break
                    buffer.append(stripped)

            safezone_text = " ".join(buffer)

            # Default to "" so the requirement text can still be built when the
            # section contains no "x is" sentence (previously an UnboundLocalError).
            value_part = next(
                (s for s in safezone_text.split(".") if "x is" in s.lower()), ""
            ).strip()
            if value_part:
                logo_safezone["Value"] = value_part
                requirement_part = safezone_text.replace(value_part, "").strip()
            else:
                requirement_part = safezone_text.strip()
            logo_safezone["Requirements"] = requirement_part

            break  # Stop after the first matching page
    finally:
        # Release the document even if text extraction fails part-way through.
        doc.close()

    return logo_safezone

# Example usage: extract the safe-zone measurement and requirement text from
# the brand kit and show the resulting dict.
pdf_path = "Neurons_brand_kit.pdf"
safezone = extract_logo_safezone_styles(pdf_path)
print(safezone)

{'Value': 'X is 30px', 'Requirements': 'The Safe Zone The minimum required clear space is defined by measurement “X”. . The Safe Zone should not contain any graphic or typhographic elements other than the logotype itself. neurons Lorem ipsum dolor sit amet'}


In [19]:
def extract_logo_colours(pdf_path):
    """Collect the hex colour codes listed on pages that mention "primary".

    A page is scanned only if its text contains the lower-case substring
    ``"primary"``; the check is case-sensitive. On such pages every line that,
    once stripped, starts with ``"#"`` is assumed to be a colour code.

    Args:
        pdf_path: Path of the PDF, passed straight to ``fitz.open``.

    Returns:
        dict with the single key ``"Logo colours"`` mapping to the list of
        stripped colour lines in document order (duplicates are kept).
    """
    logo_colour = {"Logo colours": []}

    doc = fitz.open(pdf_path)
    try:
        for page in doc:
            text = page.get_text()
            if "primary" not in text:
                continue

            # A leading "#" is the only test applied; the line is not
            # validated as a well-formed hex code.
            stripped_lines = (line.strip() for line in text.splitlines())
            logo_colour["Logo colours"].extend(
                entry for entry in stripped_lines if entry.startswith("#")
            )
    finally:
        # Release the document even if text extraction fails part-way through.
        doc.close()

    return logo_colour

# Example usage: list the colour codes found on the brand kit's "primary" page(s).
pdf_path = "Neurons_brand_kit.pdf"
colours = extract_logo_colours(pdf_path)
print(colours)

{'Logo colours': ['#85A0FE', '#AA82FF', '#FE839C', '#FFD14C', '#380F57']}


In [20]:

# Assumed requirement: the palette *excluding* the brand colours.
# NOTE: this function does not exclude anything yet - it returns every colour
# code found in the document, brand colours included.
def extract_palette_styles(pdf_path):
    """Collect every hex colour code listed anywhere in a PDF.

    Every line that, once stripped, starts with ``"#"`` is assumed to be a
    colour code; no page filtering is applied.

    Args:
        pdf_path: Path of the PDF, passed straight to ``fitz.open``.

    Returns:
        dict with the single key ``"Colours"`` mapping to the list of stripped
        colour lines in document order (duplicates are kept).
    """
    logo_colour_palette = {"Colours": []}

    doc = fitz.open(pdf_path)
    try:
        for page in doc:
            # A leading "#" is the only test applied; the line is not
            # validated as a well-formed hex code.
            stripped_lines = (line.strip() for line in page.get_text().splitlines())
            logo_colour_palette["Colours"].extend(
                entry for entry in stripped_lines if entry.startswith("#")
            )
    finally:
        # Release the document even if text extraction fails part-way through.
        doc.close()

    return logo_colour_palette

# Example usage: list every colour code found anywhere in the brand kit.
# NOTE: this rebinds `colours`, the same name the logo-colours cell assigns.
pdf_path = "Neurons_brand_kit.pdf"
colours = extract_palette_styles(pdf_path)
print(colours)

{'Colours': ['#85A0FE', '#AA82FF', '#FE839C', '#FFD14C', '#380F57', '#FD6483', '#FE839C', '#FEA8BA', '#FFC1CD', '#FFCDD7', '#FFECF0', '#6184FF', '#6184FF', '#85A0FE', '#AABDFE', '#C2CFFF', '#CED9FF', '#EDF1FF', '#F3F6FF', '#FFC51F', '#FFD14C', '#FFDF82', '#FFE8A5', '#FFEDB7', '#FFF8E4', '#FFFAED', '#290A40', '#380F57', '#745789', '#9C87AB', '#AF9FBC', '#E1DBE6', '#EBE7EE', '#444343', '#666666', '#858585', '#A3A3A3', '#C2C2C2', '#E0E0E0', '#F0F0F0', '#AA82FF', '#C5ABFD', '#D1BCFE', '#DCCDFE', '#E8DDFE', '#F3EEFF', '#F9F7FF', '#074DE4', '#346CE5', '#5D89EA', '#85A7EF', '#AEC4F5', '#D6E2FA', '#EBF0FC', '#00CF53', '#25DD6F', '#50E48B', '#7CEBA8', '#A8F1C5', '#D3F8E2', '#E9FCF1', '#FF8B22', '#FFA049', '#FFB36D', '#FFC692', '#FFD9B6', '#FFECDB', '#FFF6ED', '#E0414A', '#DE5E65', '#E57F84', '#EB9FA3', '#F2BFC2', '#F8DFE0', '#FCEFF0', '#41EFC6', '#89EFD7', '#A1F2DF', '#B9F5E7', '#E7FCF7', '#F3FDFB']}


In [23]:
# The function that will be used in the API
# Make it more readable what each is? Just return a string along with it?
def extract_brand_compliance(pdf_path):
    """Run the four extractors on one PDF and bundle their results.

    Args:
        pdf_path: Path of the PDF; handed unchanged to each extractor.

    Returns:
        dict with the keys ``"font_styles"``, ``"logo_safezone"``,
        ``"logo_colour"`` and ``"logo_colour_palette"``, each holding the
        return value of the matching extractor.
    """
    compliance = {}
    compliance["font_styles"] = extract_font_styles(pdf_path)
    compliance["logo_safezone"] = extract_logo_safezone_styles(pdf_path)
    compliance["logo_colour"] = extract_logo_colours(pdf_path)
    compliance["logo_colour_palette"] = extract_palette_styles(pdf_path)
    return compliance

# Test: run the combined extractor on the brand kit and show the full result.
pdf_path = "Neurons_brand_kit.pdf"
final = extract_brand_compliance(pdf_path)
print(final)

# Example of how we might call it in the API: individual sections are read
# from the result by key.
print(final["font_styles"])
print(final["logo_safezone"])

{'font_styles': {'Primary': 'Lexend', 'Secondary': 'Inter'}, 'logo_safezone': {'Value': 'X is 30px', 'Requirements': 'The Safe Zone The minimum required clear space is defined by measurement “X”. . The Safe Zone should not contain any graphic or typhographic elements other than the logotype itself. neurons Lorem ipsum dolor sit amet'}, 'logo_colour': {'Logo colours': ['#85A0FE', '#AA82FF', '#FE839C', '#FFD14C', '#380F57']}, 'logo_colour_palette': {'Colours': ['#85A0FE', '#AA82FF', '#FE839C', '#FFD14C', '#380F57', '#FD6483', '#FE839C', '#FEA8BA', '#FFC1CD', '#FFCDD7', '#FFECF0', '#6184FF', '#6184FF', '#85A0FE', '#AABDFE', '#C2CFFF', '#CED9FF', '#EDF1FF', '#F3F6FF', '#FFC51F', '#FFD14C', '#FFDF82', '#FFE8A5', '#FFEDB7', '#FFF8E4', '#FFFAED', '#290A40', '#380F57', '#745789', '#9C87AB', '#AF9FBC', '#E1DBE6', '#EBE7EE', '#444343', '#666666', '#858585', '#A3A3A3', '#C2C2C2', '#E0E0E0', '#F0F0F0', '#AA82FF', '#C5ABFD', '#D1BCFE', '#DCCDFE', '#E8DDFE', '#F3EEFF', '#F9F7FF', '#074DE4', '#346CE5

NOTE: The logo colours are also part of the full palette, so it is unclear why two separate functions are needed. TODO: ask about this at some point.