In [16]:
import sys
import os
import pandas as pd
import re 
from langchain_community.document_loaders import PyMuPDFLoader

# Dynamically resolve the base directory for Jupyter notebooks
BASE_DIR = os.path.abspath(os.path.join(os.getcwd(), '../F1-regulation-RAG-2025'))
sys.path.append(BASE_DIR)

In [31]:
# Ensure the data directory path is resolved correctly
processed_data_dir = os.path.join(BASE_DIR, 'data', 'processed')
os.makedirs(processed_data_dir, exist_ok=True)
processed_data_dir 

'/Users/pierre.muller/Documents/Perso/F1-regulation-RAG-2025/data/processed'

In [32]:
!ls {BASE_DIR}

README.md      [34mexamples[m[m       pyproject.toml uv.lock
[34mdata[m[m           main.py        [34msrc[m[m


In [33]:
def remove_footer(text: str) -> str:
    """
    Removes the footer from the given text.

    Args:
        text (str): The text containing the footer.

    Returns:
        str: The text with the footer removed.
    """
    footer_pattern = r"2025 Formula 1 Sporting Regulations\s+\d+/\d+\s+30 April 2025\s+©2025 Fédération Internationale de l’Automobile\s+Issue 5"
    cleaned_text = re.sub(footer_pattern, '', text)
    return cleaned_text

In [34]:
def extract_pdf_text_to_dataframe(pdf_path: str) -> pd.DataFrame:
    """
    Extracts text from a PDF file and returns it in a pandas DataFrame.

    Args:
        pdf_path (str): Path to the PDF file.

    Returns:
        pd.DataFrame: A DataFrame containing the extracted text, with each page as a row.
    """
    try:
        
        # Load the document
        loader = PyMuPDFLoader(pdf_path,extract_tables='markdown')
        documents = loader.load_and_split()

        pages_text = [remove_footer(doc.page_content) for doc in documents]

        df = pd.DataFrame({'Page': range(1, len(pages_text) + 1), 'Text': pages_text})
        return df


    except Exception as e:
        raise RuntimeError(f"Error processing PDF file: {e}")



In [73]:
pdf_path = os.path.join(BASE_DIR, 'data', 'pdfs', 'FIA 2025 Formula 1 Sporting Regulations - Issue 5 - 2025-04-30.pdf')
df = extract_pdf_text_to_dataframe(pdf_path)
df.to_csv(os.path.join(processed_data_dir, 'pdf_text.csv'), index=False)
print(df['Text'][1])

 
 
1) 
REGULATIONS 
1.1 
The FIA will organise the FIA Formula One World Championship (the Championship) which is the 
property of the FIA and comprises two titles of World Champion, one for drivers and one for 
constructors. It consists of the Formula One Grand Prix races which are included in the Formula 
One calendar and in respect of which the ASNs and organisers have signed organisation 
agreements with the FIA. All the participating parties (FIA, ASNs, organisers, Competitors and 
circuits) undertake to apply as well as observe the rules governing the Championship and must 
hold the appropriate FIA Licences which are issued to drivers, Competitors, officials, organisers 
and circuits. 
1.2 
The final text of these Sporting Regulations shall be the English version which will be used should 
any dispute arise as to their interpretation. Headings in this document are for ease of reference 
only and do not form part of these Sporting Regulations. 
1.3 
These Sporting Regulations app

In [56]:
df.iloc[-1]['Text']

' \n \nAPPENDIX 9 \n \nAPPROVED CHANGES FOR SUBSEQUENT YEARS \n \nConvention: \nDark red text: All changes previously agreed for subsequent years \nDark red highlighted (yellow) text: new changes for subsequent years, approved by the WMSC \n \n1 \nChanges for 2026'

In [57]:
match = re.search(r"APPENDIX[\s\n]+\d+", df.iloc[-1]['Text'])
if match:
    print("Match found:", match.group())
else:
    print("No match found.")

Match found: APPENDIX 9


In [None]:
def extract_sections_to_dataframe(text: str) -> pd.DataFrame:
    """
    Extracts sections from the text based on patterns like 'NUMBER) A TITLE'
    and returns a DataFrame with 'title' and 'chunk' columns.

    Args:
        text (str): The text to process.

    Returns:
        pd.DataFrame: A DataFrame with 'title' and 'chunk' columns.
    """
    # Updated regex to handle cases like "VIRTUAL SAFETY CAR (VSC)"
    # and exclude titles with letters before the number
    section_pattern = r"(?<![A-Z])(\d+\)\s+(?:[A-Z ,\n\-\(\)É]+))\n"
    matches = re.split(section_pattern, text)

    titles = []
    chunks = []

    for i in range(1, len(matches), 2):
        titles.append(matches[i].replace('\n', ' ').strip())  # Clean multi-line titles
        chunks.append(matches[i + 1].strip())

    processed_df = pd.DataFrame({'title': titles, 'chunk': chunks})
    processed_df[['title', 'chunk']] = processed_df[['title', 'chunk']].map(lambda x: x.replace('\n', ' ').strip())

    return processed_df

# Process the extracted text into sections
df_sections = pd.concat([extract_sections_to_dataframe(page) for page in df['Text']], ignore_index=True)
df_sections.to_csv(os.path.join(processed_data_dir, 'pdf_sections.csv'), index=False)
print(df_sections.tail())

                                                title  \
57          60)  POST SPRINT AND POST RACE PARC FERMÉ   
58                 61)  SPRINT SESSION CLASSIFICATION   
59                           62)  RACE CLASSIFICATION   
60  63)  PODIUM CEREMONY AND POST RACE PRESS CONFE...   
61                                64)  TEAM EQUIPMENT   

                                                chunk  
57  60.1  Only those officials charged with superv...  
58  61.1  The car placed first will be the one hav...  
59  62.1  The car placed first will be the one hav...  
60  63.1  The drivers finishing the race in 1st, 2...  
61  64.1  All equipment used to cool the car on th...  


In [93]:
df['Text'][18]

' \n \n17.3 \nAppeals may not be made against decision concerning the following: \n \na) \nPenalties imposed under Articles 54.3a), 54.3b), 54.3c), 54.3d), 54.3e), 54.3f) or 54.3g), \nincluding those imposed during the last three (3) laps or after the end of a sprint session \nor a race. \n \nb) \nAny drop of grid positions imposed under Article 28. \n \nc) \nAny penalty imposed under Article 37.4. \n \nd) \nAny decision taken by the stewards in relation to Article 42. \n \ne) \nAny penalty imposed under Articles 43.5, 44.4 or 58.3. \n \nf) \nAny decision taken by the stewards under Article 4.2. \n17.4 \nPetitions for review shall be made in accordance with the Code and accompanied by a deposit \nof €2000. \n \n18) \nSANCTIONS \n18.1 \nThe stewards may impose the penalties specifically set out in these Sporting Regulations in \naddition to or instead of any other penalties available to them under the Code. \n18.2 \nAny driver who receives five (5) reprimands in the same Championship wi

In [102]:
re.split(r"(\d+\)\s+[A-Z ]+)\n",df['Text'][73])

[' \n \n59.3 \nAfter receiving the end-of-session signal all cars must proceed on the circuit directly to the parc \nfermé without any unnecessary delay, without receiving any object whatsoever and without any \nassistance (except that of the marshals if necessary). \n \nAn exception to Article 26.4 and to the above will be made for the winning driver of the race \nwho may perform an act of celebration before reaching parc fermé, provided any such act: \n \na) \nIs performed safely and does not endanger other drivers or any officials. \n \nb) \nDoes not call into question the legality of his car. \n \nc) \nDoes not delay the podium ceremony. \n \nAny classified car which cannot reach the parc fermé under its own power will be placed under \nthe exclusive control of the marshals who will take the car to the parc fermé. \n \n60) \nPOST SPRINT AND POST RACE PARC FERMÉ \n60.1 \nOnly those officials charged with supervision may enter the parc fermé. No intervention of any \nkind is allowed 

In [115]:
df_sections[~df_sections['title'].str.contains('VOID')]

Unnamed: 0,title,chunk
0,1) REGULATIONS,1.1 The FIA will organise the FIA Formula One...
1,2) GENERAL UNDERTAKING,"2.1 All drivers, Competitors and officials pa..."
2,3) GENERAL CONDITIONS,3.1 It is the Competitor’s responsibility to ...
3,4) LICENCES,"4.1 All drivers, Competitors and officials pa..."
4,5) CHAMPIONSHIP COMPETITIONS,5.1 Competitions are reserved for Formula One...
5,6) WORLD CHAMPIONSHIP,6.1 The Formula One World Championship driver...
6,7) DEAD HEAT,7.1 Prizes and points awarded for all the pos...
7,8) COMPETITORS APPLICATIONS,8.1 Applications to compete in the Championsh...
8,9) CAR LIVERY AND COMPETITION NUMBERS,9.1 Car Livery a) The provisions of the Cod...
9,10) TRACK RUNNING TIME OUTSIDE A COMPETITION,10.1 Testing of Current Cars (TCC) a) Testi...


In [59]:
df_sections['length'] = df_sections['chunk'].apply(len)

In [60]:
df_sections.query('length > 32000')

Unnamed: 0,title,chunk,length


# Appendices must be added

In [None]:
def extract_appendices_to_dataframe(text: str) -> pd.DataFrame:
    """
    Extracts sections from the text based on patterns like 'NUMBER) A TITLE'
    and returns a DataFrame with 'title' and 'chunk' columns.

    Args:
        text (str): The text to process.

    Returns:
        pd.DataFrame: A DataFrame with 'title' and 'chunk' columns.
    """
    section_pattern = r"APPENDIX[\s\n]+\d+"
    matches = re.split(section_pattern, text)
    print(matches)

    titles = []
    chunks = []

    for i in range(1, len(matches), 2):
        titles.append(matches[i].strip())
        print(f"Title {i//2 + 1}: {matches[i].strip()}")
        chunks.append(matches[i + 1].strip())

    return pd.DataFrame({'title': titles, 'chunk': chunks})

# Process the extracted text into sections
df_appendices = pd.concat([extract_appendices_to_dataframe(page) for page in df['Text']], ignore_index=True)
df_appendices.to_csv(os.path.join(processed_data_dir, 'pdf_appendices.csv'), index=False)
print(df_appendices.tail())

title                                 64) \nTEAM EQUIPMENT
chunk    64.1 \nAll equipment used to cool the car on t...
Name: 56, dtype: object

### Token embeddings

In [68]:

from sentence_transformers import SentenceTransformer
model = SentenceTransformer("Qwen/Qwen3-Embedding-0.6B")


In [69]:
df = pd.read_csv(os.path.join(processed_data_dir, 'pdf_sections.csv'))

In [70]:
df.head()  # Display the first few rows of the DataFrame

Unnamed: 0,title,chunk
0,1) REGULATIONS,1.1 The FIA will organise the FIA Formula One...
1,2) GENERAL UNDERTAKING,"2.1 All drivers, Competitors and officials pa..."
2,3) GENERAL CONDITIONS,3.1 It is the Competitor’s responsibility to ...
3,4) LICENCES,"4.1 All drivers, Competitors and officials pa..."
4,5) CHAMPIONSHIP COMPETITIONS,5.1 Competitions are reserved for Formula One...


In [71]:
df.loc[df.isna().sum(axis=1) > 0]

Unnamed: 0,title,chunk
26,29) VOID,
36,41) VOID,


In [25]:
df['title'] = df['title'].apply(lambda x: x.replace('\n', ' ').strip())  # Clean titles
df['chunk'] = df['chunk'].apply(lambda x: x.replace('\n', ' '))  # Clean chunks

AttributeError: 'float' object has no attribute 'replace'