In [50]:
import sys
import os
import pandas as pd
import re 
from langchain_community.document_loaders import PyMuPDFLoader

# Dynamically resolve the base directory for Jupyter notebooks
BASE_DIR = os.path.abspath(os.path.join(os.getcwd(), '../..'))
sys.path.append(BASE_DIR)

In [51]:
!ls {BASE_DIR}

README.md      [34mexamples[m[m       pyproject.toml uv.lock
[34mdata[m[m           main.py        [34msrc[m[m


In [52]:
def remove_footer(text: str) -> str:
    """
    Removes the footer from the given text.

    Args:
        text (str): The text containing the footer.

    Returns:
        str: The text with the footer removed.
    """
    footer_pattern = r"2025 Formula 1 Sporting Regulations\s+\d+/\d+\s+30 April 2025\s+©2025 Fédération Internationale de l’Automobile\s+Issue 5"
    cleaned_text = re.sub(footer_pattern, '', text)
    return cleaned_text

In [53]:
def extract_pdf_text_to_dataframe(pdf_path: str) -> pd.DataFrame:
    """
    Extracts text from a PDF file and returns it in a pandas DataFrame.

    Args:
        pdf_path (str): Path to the PDF file.

    Returns:
        pd.DataFrame: A DataFrame containing the extracted text, with each page as a row.
    """
    try:
        
        # Load the document
        loader = PyMuPDFLoader(pdf_path,extract_tables='markdown')
        documents = loader.load_and_split()

        pages_text = [remove_footer(doc.page_content) for doc in documents]

        df = pd.DataFrame({'Page': range(1, len(pages_text) + 1), 'Text': pages_text})
        return df


    except Exception as e:
        raise RuntimeError(f"Error processing PDF file: {e}")



In [54]:
# Ensure the data directory path is resolved correctly
processed_data_dir = os.path.join(BASE_DIR, 'data', 'processed')
os.makedirs(processed_data_dir, exist_ok=True)

In [63]:
pdf_path = os.path.join(BASE_DIR, 'data', 'pdfs', 'FIA 2025 Formula 1 Sporting Regulations - Issue 5 - 2025-04-30.pdf')
df = extract_pdf_text_to_dataframe(pdf_path)
df.to_csv(os.path.join(processed_data_dir, 'pdf_text.csv'), index=False)
print(df['Text'][1])

 
 
1) 
REGULATIONS 
1.1 
The FIA will organise the FIA Formula One World Championship (the Championship) which is the 
property of the FIA and comprises two titles of World Champion, one for drivers and one for 
constructors. It consists of the Formula One Grand Prix races which are included in the Formula 
One calendar and in respect of which the ASNs and organisers have signed organisation 
agreements with the FIA. All the participating parties (FIA, ASNs, organisers, Competitors and 
circuits) undertake to apply as well as observe the rules governing the Championship and must 
hold the appropriate FIA Licences which are issued to drivers, Competitors, officials, organisers 
and circuits. 
1.2 
The final text of these Sporting Regulations shall be the English version which will be used should 
any dispute arise as to their interpretation. Headings in this document are for ease of reference 
only and do not form part of these Sporting Regulations. 
1.3 
These Sporting Regulations app

In [64]:
def extract_sections_to_dataframe(text: str) -> pd.DataFrame:
    """
    Extracts sections from the text based on patterns like 'NUMBER) A TITLE'
    and returns a DataFrame with 'title' and 'chunk' columns.

    Args:
        text (str): The text to process.

    Returns:
        pd.DataFrame: A DataFrame with 'title' and 'chunk' columns.
    """
    section_pattern = r"(\d+\)\s+[A-Z ]+)\n"
    matches = re.split(section_pattern, text)

    titles = []
    chunks = []

    for i in range(1, len(matches), 2):
        titles.append(matches[i].strip())
        chunks.append(matches[i + 1].strip())

    return pd.DataFrame({'title': titles, 'chunk': chunks})

# Process the extracted text into sections
df_sections = pd.concat([extract_sections_to_dataframe(page) for page in df['Text']], ignore_index=True)
df_sections.to_csv(os.path.join(processed_data_dir, 'pdf_sections.csv'), index=False)
print(df_sections.tail())

                                                title  \
52                                       59) \nFINISH   
53                61) \nSPRINT SESSION CLASSIFICATION   
54                          62) \nRACE CLASSIFICATION   
55  63) \nPODIUM CEREMONY AND POST RACE PRESS CONF...   
56                               64) \nTEAM EQUIPMENT   

                                                chunk  
52  59.1 \nA chequered flag will be the end-of-ses...  
53  61.1 \nThe car placed first will be the one ha...  
54  62.1 \nThe car placed first will be the one ha...  
55  63.1 \nThe drivers finishing the race in 1st, ...  
56  64.1 \nAll equipment used to cool the car on t...  


In [66]:
def extract_appendices_to_dataframe(text: str) -> pd.DataFrame:
    """
    Extracts sections from the text based on patterns like 'NUMBER) A TITLE'
    and returns a DataFrame with 'title' and 'chunk' columns.

    Args:
        text (str): The text to process.

    Returns:
        pd.DataFrame: A DataFrame with 'title' and 'chunk' columns.
    """
    section_pattern = r"APPENDIX\s+\d+\s*\n"
    matches = re.split(section_pattern, text)

    titles = []
    chunks = []

    for i in range(1, len(matches), 2):
        titles.append(matches[i].strip())
        chunks.append(matches[i + 1].strip())

    return pd.DataFrame({'title': titles, 'chunk': chunks})

# Process the extracted text into sections
df_appendices = pd.concat([extract_appendices_to_dataframe(page) for page in df['Text']], ignore_index=True)
df_appendices.to_csv(os.path.join(processed_data_dir, 'pdf_appendices.csv'), index=False)
print(df_appendices.tail())

IndexError: list index out of range

In [59]:
df_sections['length'] = df_sections['chunk'].apply(len)

In [60]:
df_sections.query('length > 32000')

Unnamed: 0,title,chunk,length


# Appendices must be added

title                                 64) \nTEAM EQUIPMENT
chunk    64.1 \nAll equipment used to cool the car on t...
Name: 56, dtype: object